[llvm] bdf2fbb - [AMDGPU] Convert some tests to opaque pointers (NFC)

Nikita Popov via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 19 03:42:37 PST 2022


Author: Nikita Popov
Date: 2022-12-19T12:41:13+01:00
New Revision: bdf2fbba9cee60b4b260ff17e4f44c475c11e715

URL: https://github.com/llvm/llvm-project/commit/bdf2fbba9cee60b4b260ff17e4f44c475c11e715
DIFF: https://github.com/llvm/llvm-project/commit/bdf2fbba9cee60b4b260ff17e4f44c475c11e715.diff

LOG: [AMDGPU] Convert some tests to opaque pointers (NFC)
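    
    For reference, the rewrite is purely mechanical: typed pointer types such as
    "i32 addrspace(1)*" become the opaque "ptr addrspace(1)", and the pointer
    operands of load/store/getelementptr are updated to match. A minimal
    before/after sketch (illustrative only, not taken verbatim from any test in
    this commit):
    
        ; Typed pointers (old form)
        define amdgpu_kernel void @example(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
          %val = load i32, i32 addrspace(3)* %in
          store i32 %val, i32 addrspace(1)* %out
          ret void
        }
    
        ; Opaque pointers (new form)
        define amdgpu_kernel void @example(ptr addrspace(1) %out, ptr addrspace(3) %in) {
          %val = load i32, ptr addrspace(3) %in
          store i32 %val, ptr addrspace(1) %out
          ret void
        }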

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll
    llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
    llvm/test/CodeGen/AMDGPU/add-debug.ll
    llvm/test/CodeGen/AMDGPU/add.i16.ll
    llvm/test/CodeGen/AMDGPU/add.ll
    llvm/test/CodeGen/AMDGPU/add.v2i16.ll
    llvm/test/CodeGen/AMDGPU/add_i1.ll
    llvm/test/CodeGen/AMDGPU/add_i128.ll
    llvm/test/CodeGen/AMDGPU/add_i64.ll
    llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll
    llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
    llvm/test/CodeGen/AMDGPU/agpr-csr.ll
    llvm/test/CodeGen/AMDGPU/agpr-remat.ll
    llvm/test/CodeGen/AMDGPU/always-uniform.ll
    llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
    llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
    llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
    llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
    llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
    llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
    llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
    llvm/test/CodeGen/AMDGPU/atomic_load_add.ll
    llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
    llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
    llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
    llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
    llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll
    llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll
    llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
    llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
    llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
    llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll
    llvm/test/CodeGen/AMDGPU/basic-branch.ll
    llvm/test/CodeGen/AMDGPU/basic-loop.ll
    llvm/test/CodeGen/AMDGPU/bfi_nested.ll
    llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
    llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
    llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
    llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
    llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
    llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll
    llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
    llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
    llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
    llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
    llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
    llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
    llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll
    llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll
    llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
    llvm/test/CodeGen/AMDGPU/build_vector.ll
    llvm/test/CodeGen/AMDGPU/carryout-selection.ll
    llvm/test/CodeGen/AMDGPU/cayman-loop-bug.ll
    llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
    llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll
    llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
    llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
    llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
    llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
    llvm/test/CodeGen/AMDGPU/code-object-v3.ll
    llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
    llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll
    llvm/test/CodeGen/AMDGPU/commute-compares.ll
    llvm/test/CodeGen/AMDGPU/concat_vectors.ll
    llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
    llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
    llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
    llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
    llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
    llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
    llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
    llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
    llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
    llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
    llvm/test/CodeGen/AMDGPU/dag-divergence.ll
    llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
    llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
    llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
    llvm/test/CodeGen/AMDGPU/debug-value.ll
    llvm/test/CodeGen/AMDGPU/debug-value2.ll
    llvm/test/CodeGen/AMDGPU/debug.ll
    llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
    llvm/test/CodeGen/AMDGPU/default-fp-mode.ll
    llvm/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll
    llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
    llvm/test/CodeGen/AMDGPU/diverge-switch-default.ll
    llvm/test/CodeGen/AMDGPU/divergence-at-use.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-bitreverse.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-ctlz-cttz.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-ctpop.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-min-max.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-not-isel.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll
    llvm/test/CodeGen/AMDGPU/divrem24-assume.ll
    llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
    llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
    llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
    llvm/test/CodeGen/AMDGPU/ds_read2.ll
    llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
    llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
    llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
    llvm/test/CodeGen/AMDGPU/ds_write2.ll
    llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
    llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll
    llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll
    llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
    llvm/test/CodeGen/AMDGPU/extload-align.ll
    llvm/test/CodeGen/AMDGPU/extload-private.ll
    llvm/test/CodeGen/AMDGPU/extload.ll
    llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
    llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
    llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
    llvm/test/CodeGen/AMDGPU/extract-subvector.ll
    llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
    llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
    llvm/test/CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll
    llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
    llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
    llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
    llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll
    llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
    llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
    llvm/test/CodeGen/AMDGPU/fcmp.ll
    llvm/test/CodeGen/AMDGPU/fcmp64.ll
    llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
    llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
    llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
    llvm/test/CodeGen/AMDGPU/fdot2.ll
    llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
    llvm/test/CodeGen/AMDGPU/flat-address-space.ll
    llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
    llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
    llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll
    llvm/test/CodeGen/AMDGPU/flat-error-unsupported-gpu-hsa.ll
    llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
    llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/flat_atomics.ll
    llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
    llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll
    llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll
    llvm/test/CodeGen/AMDGPU/fmed3.ll
    llvm/test/CodeGen/AMDGPU/fmin3.ll
    llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
    llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
    llvm/test/CodeGen/AMDGPU/fminnum.f64.ll
    llvm/test/CodeGen/AMDGPU/fminnum.ll
    llvm/test/CodeGen/AMDGPU/fneg-fabs-divergence-driven-isel.ll
    llvm/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll
    llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
    llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
    llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
    llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
    llvm/test/CodeGen/AMDGPU/fptrunc.ll
    llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
    llvm/test/CodeGen/AMDGPU/frem.ll
    llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
    llvm/test/CodeGen/AMDGPU/ftrunc.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
    llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll
    llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
    llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
    llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
    llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll
    llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll
    llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
    llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
    llvm/test/CodeGen/AMDGPU/global-constant.ll
    llvm/test/CodeGen/AMDGPU/global-directive.ll
    llvm/test/CodeGen/AMDGPU/global-extload-i16.ll
    llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
    llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
    llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll
    llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
    llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
    llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
    llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
    llvm/test/CodeGen/AMDGPU/global_atomics.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_i64_min_max_system.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_min_max_system.ll
    llvm/test/CodeGen/AMDGPU/global_smrd.ll
    llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
    llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
    llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll
    llvm/test/CodeGen/AMDGPU/hsa.ll
    llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
    llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
    llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
    llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
    llvm/test/CodeGen/AMDGPU/icmp.i16.ll
    llvm/test/CodeGen/AMDGPU/icmp64.ll
    llvm/test/CodeGen/AMDGPU/idiv-licm.ll
    llvm/test/CodeGen/AMDGPU/idot2.ll
    llvm/test/CodeGen/AMDGPU/idot4s.ll
    llvm/test/CodeGen/AMDGPU/idot4u.ll
    llvm/test/CodeGen/AMDGPU/idot8s.ll
    llvm/test/CodeGen/AMDGPU/idot8u.ll
    llvm/test/CodeGen/AMDGPU/image-attributes.ll
    llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
    llvm/test/CodeGen/AMDGPU/image-resource-id.ll
    llvm/test/CodeGen/AMDGPU/image-schedule.ll
    llvm/test/CodeGen/AMDGPU/imm.ll
    llvm/test/CodeGen/AMDGPU/imm16.ll
    llvm/test/CodeGen/AMDGPU/immv216.ll
    llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
    llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll
    llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
    llvm/test/CodeGen/AMDGPU/implicit-kernel-argument-alignment.ll
    llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll
    llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
    llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
    llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
    llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
    llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
    llvm/test/CodeGen/AMDGPU/indirect-call.ll
    llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
    llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll
    llvm/test/CodeGen/AMDGPU/infinite-loop.ll
    llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
    llvm/test/CodeGen/AMDGPU/inline-asm.ll
    llvm/test/CodeGen/AMDGPU/inline-attr.ll
    llvm/test/CodeGen/AMDGPU/inline-constraints.ll
    llvm/test/CodeGen/AMDGPU/inline-maxbb.ll
    llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll
    llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
    llvm/test/CodeGen/AMDGPU/insert_subreg.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
    llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
    llvm/test/CodeGen/AMDGPU/kcache-fold.ll
    llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
    llvm/test/CodeGen/AMDGPU/lds-alignment.ll
    llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll
    llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
    llvm/test/CodeGen/AMDGPU/lds-bounds.ll
    llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
    llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
    llvm/test/CodeGen/AMDGPU/lds-initializer.ll
    llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
    llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
    llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll
    llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
    llvm/test/CodeGen/AMDGPU/lds-reject-anonymous-kernels.ll
    llvm/test/CodeGen/AMDGPU/lds-relocs.ll
    llvm/test/CodeGen/AMDGPU/lds-size.ll
    llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
    llvm/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll
    llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
    llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll
    llvm/test/CodeGen/AMDGPU/literals.ll
    llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
    llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
    llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
    llvm/test/CodeGen/AMDGPU/load-global-f64.ll
    llvm/test/CodeGen/AMDGPU/load-global-i1.ll
    llvm/test/CodeGen/AMDGPU/load-global-i16.ll
    llvm/test/CodeGen/AMDGPU/load-global-i64.ll
    llvm/test/CodeGen/AMDGPU/load-global-i8.ll
    llvm/test/CodeGen/AMDGPU/load-hi16.ll
    llvm/test/CodeGen/AMDGPU/load-input-fold.ll
    llvm/test/CodeGen/AMDGPU/load-lo16.ll
    llvm/test/CodeGen/AMDGPU/load-local-f32-no-ds128.ll
    llvm/test/CodeGen/AMDGPU/load-local-f32.ll
    llvm/test/CodeGen/AMDGPU/load-local-f64.ll
    llvm/test/CodeGen/AMDGPU/load-local-i1.ll
    llvm/test/CodeGen/AMDGPU/load-local-i16.ll
    llvm/test/CodeGen/AMDGPU/load-local-i32.ll
    llvm/test/CodeGen/AMDGPU/load-local-i64.ll
    llvm/test/CodeGen/AMDGPU/load-local-i8.ll
    llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
    llvm/test/CodeGen/AMDGPU/load-local.128.ll
    llvm/test/CodeGen/AMDGPU/load-local.96.ll
    llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
    llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
    llvm/test/CodeGen/AMDGPU/local-64.ll
    llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
    llvm/test/CodeGen/AMDGPU/local-atomics.ll
    llvm/test/CodeGen/AMDGPU/local-atomics64.ll
    llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
    llvm/test/CodeGen/AMDGPU/local-memory.ll
    llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
    llvm/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
    llvm/test/CodeGen/AMDGPU/loop-address.ll
    llvm/test/CodeGen/AMDGPU/loop-idiom.ll
    llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
    llvm/test/CodeGen/AMDGPU/loop-prefetch.ll
    llvm/test/CodeGen/AMDGPU/loop_break.ll
    llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-declaration.ll
    llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
    llvm/test/CodeGen/AMDGPU/lower-empty-ctor-dtor.ll
    llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-check-metadata.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
    llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
    llvm/test/CodeGen/AMDGPU/mad.u16.ll
    llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
    llvm/test/CodeGen/AMDGPU/mad_64_32.ll
    llvm/test/CodeGen/AMDGPU/mad_int24.ll
    llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
    llvm/test/CodeGen/AMDGPU/mad_uint24.ll
    llvm/test/CodeGen/AMDGPU/mai-inline.ll
    llvm/test/CodeGen/AMDGPU/max.i16.ll
    llvm/test/CodeGen/AMDGPU/max.ll
    llvm/test/CodeGen/AMDGPU/max3.ll
    llvm/test/CodeGen/AMDGPU/med3-no-simplify.ll
    llvm/test/CodeGen/AMDGPU/memory_clause.ll
    llvm/test/CodeGen/AMDGPU/mesa_regression.ll
    llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll
    llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
    llvm/test/CodeGen/AMDGPU/mfma-loop.ll
    llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
    llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
    llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll
    llvm/test/CodeGen/AMDGPU/min.ll
    llvm/test/CodeGen/AMDGPU/min3.ll
    llvm/test/CodeGen/AMDGPU/minmax.ll
    llvm/test/CodeGen/AMDGPU/mmo-target-flags-folding.ll
    llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
    llvm/test/CodeGen/AMDGPU/mul.i16.ll
    llvm/test/CodeGen/AMDGPU/mul.ll
    llvm/test/CodeGen/AMDGPU/mul_int24.ll
    llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
    llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
    llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll
    llvm/test/CodeGen/AMDGPU/multilevel-break.ll
    llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
    llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
    llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
    llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll
    llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
    llvm/test/CodeGen/AMDGPU/offset-split-global.ll
    llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll
    llvm/test/CodeGen/AMDGPU/opencl-printf-and-hostcall.ll
    llvm/test/CodeGen/AMDGPU/operand-folding.ll
    llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
    llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
    llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
    llvm/test/CodeGen/AMDGPU/packed-fp32.ll
    llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
    llvm/test/CodeGen/AMDGPU/packetizer.ll
    llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
    llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
    llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
    llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
    llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll
    llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
    llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
    llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll
    llvm/test/CodeGen/AMDGPU/ptrmask.ll
    llvm/test/CodeGen/AMDGPU/pv-packing.ll
    llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
    llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
    llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
    llvm/test/CodeGen/AMDGPU/read_register.ll
    llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll
    llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
    llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
    llvm/test/CodeGen/AMDGPU/register-count-comments.ll
    llvm/test/CodeGen/AMDGPU/rel32.ll
    llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
    llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
    llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-to-declare-only-func.ll
    llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll
    llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll
    llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll
    llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll
    llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll
    llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll
    llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll
    llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
    llvm/test/CodeGen/AMDGPU/ret.ll
    llvm/test/CodeGen/AMDGPU/ret_jump.ll
    llvm/test/CodeGen/AMDGPU/returnaddress.ll
    llvm/test/CodeGen/AMDGPU/rotl.i64.ll
    llvm/test/CodeGen/AMDGPU/rotl.ll
    llvm/test/CodeGen/AMDGPU/rotr.i64.ll
    llvm/test/CodeGen/AMDGPU/rotr.ll
    llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll
    llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
    llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
    llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
    llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
    llvm/test/CodeGen/AMDGPU/sched-setprio.ll
    llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
    llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
    llvm/test/CodeGen/AMDGPU/schedule-if-2.ll
    llvm/test/CodeGen/AMDGPU/schedule-if.ll
    llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
    llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
    llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
    llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
    llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
    llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
    llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll
    llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
    llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
    llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll
    llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
    llvm/test/CodeGen/AMDGPU/sdiv.ll
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/sdivrem24.ll
    llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll
    llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
    llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
    llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
    llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
    llvm/test/CodeGen/AMDGPU/select-i1.ll
    llvm/test/CodeGen/AMDGPU/select-opt.ll
    llvm/test/CodeGen/AMDGPU/select-undef.ll
    llvm/test/CodeGen/AMDGPU/select-vectors.ll
    llvm/test/CodeGen/AMDGPU/select.f16.ll
    llvm/test/CodeGen/AMDGPU/select.ll
    llvm/test/CodeGen/AMDGPU/select64.ll
    llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll
    llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
    llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
    llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
    llvm/test/CodeGen/AMDGPU/selectcc.ll
    llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll
    llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
    llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
    llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll
    llvm/test/CodeGen/AMDGPU/setcc-opt.ll
    llvm/test/CodeGen/AMDGPU/setcc.ll
    llvm/test/CodeGen/AMDGPU/setcc64.ll
    llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
    llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
    llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
    llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll
    llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
    llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll
    llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
    llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
    llvm/test/CodeGen/AMDGPU/sgprcopies.ll
    llvm/test/CodeGen/AMDGPU/shift-select.ll
    llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
    llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
    llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
    llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
    llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll
    llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
    llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
    llvm/test/CodeGen/AMDGPU/si-scheduler.ll
    llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
    llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
    llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
    llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll
    llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
    llvm/test/CodeGen/AMDGPU/smed3.ll
    llvm/test/CodeGen/AMDGPU/sminmax.ll
    llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
    llvm/test/CodeGen/AMDGPU/smrd.ll
    llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
    llvm/test/CodeGen/AMDGPU/sopk-compares.ll
    llvm/test/CodeGen/AMDGPU/speculative-execution-freecasts.ll
    llvm/test/CodeGen/AMDGPU/spill-agpr.ll
    llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
    llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll
    llvm/test/CodeGen/AMDGPU/spill-m0.ll
    llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
    llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
    llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
    llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
    llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
    llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
    llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
    llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll
    llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
    llvm/test/CodeGen/AMDGPU/split-smrd.ll
    llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
    llvm/test/CodeGen/AMDGPU/srem.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/store-barrier.ll
    llvm/test/CodeGen/AMDGPU/store-global.ll
    llvm/test/CodeGen/AMDGPU/store-hi16.ll
    llvm/test/CodeGen/AMDGPU/store-local.128.ll
    llvm/test/CodeGen/AMDGPU/store-local.96.ll
    llvm/test/CodeGen/AMDGPU/store-local.ll
    llvm/test/CodeGen/AMDGPU/store-private.ll
    llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll
    llvm/test/CodeGen/AMDGPU/store-v3i64.ll
    llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll
    llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
    llvm/test/CodeGen/AMDGPU/structurize.ll
    llvm/test/CodeGen/AMDGPU/structurize1.ll
    llvm/test/CodeGen/AMDGPU/sub.i16.ll
    llvm/test/CodeGen/AMDGPU/sub.ll
    llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
    llvm/test/CodeGen/AMDGPU/sub_i1.ll
    llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
    llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
    llvm/test/CodeGen/AMDGPU/switch-default-block-unreachable.ll
    llvm/test/CodeGen/AMDGPU/switch-unreachable.ll
    llvm/test/CodeGen/AMDGPU/syncscopes.ll
    llvm/test/CodeGen/AMDGPU/target-cpu.ll
    llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
    llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
    llvm/test/CodeGen/AMDGPU/trunc-combine.ll
    llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
    llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
    llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll
    llvm/test/CodeGen/AMDGPU/trunc-store.ll
    llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
    llvm/test/CodeGen/AMDGPU/trunc.ll
    llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
    llvm/test/CodeGen/AMDGPU/udiv.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/udivrem.ll
    llvm/test/CodeGen/AMDGPU/udivrem24.ll
    llvm/test/CodeGen/AMDGPU/umed3.ll
    llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
    llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
    llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
    llvm/test/CodeGen/AMDGPU/uniform-crash.ll
    llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
    llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
    llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
    llvm/test/CodeGen/AMDGPU/unpack-half.ll
    llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
    llvm/test/CodeGen/AMDGPU/urem.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
    llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
    llvm/test/CodeGen/AMDGPU/v_illegal-atomics.ll
    llvm/test/CodeGen/AMDGPU/v_illegal-image_sample.ll
    llvm/test/CodeGen/AMDGPU/v_pack.ll
    llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
    llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
    llvm/test/CodeGen/AMDGPU/vector-legalizer-divergence.ll
    llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
    llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll
    llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
    llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
    llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
    llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
    llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
    llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
    llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll
    llvm/test/CodeGen/AMDGPU/vselect.ll
    llvm/test/CodeGen/AMDGPU/vselect64.ll
    llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
    llvm/test/CodeGen/AMDGPU/vtx-schedule.ll
    llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
    llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
    llvm/test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
    llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
    llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
    llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
    llvm/test/CodeGen/AMDGPU/xnor.ll
    llvm/test/CodeGen/AMDGPU/xor.ll
    llvm/test/CodeGen/AMDGPU/zero_extend.ll
    llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
    llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
    llvm/test/CodeGen/AMDGPU/zext-lid.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index 32463af9f4377..bf364345c3d08 100644
--- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -13,10 +13,10 @@
 ; FUNC-LABEL: {{^}}local_address_load:
 ; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
 ; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
-define amdgpu_kernel void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_load(ptr addrspace(1) %out, ptr addrspace(3) %in) {
 entry:
-  %0 = load i32, i32 addrspace(3)* %in
-  store i32 %0, i32 addrspace(1)* %out
+  %0 = load i32, ptr addrspace(3) %in
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -24,22 +24,22 @@ entry:
 ; SI: s_add_i32 [[SPTR:s[0-9]]]
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_read_b32 [[VPTR]]
-define amdgpu_kernel void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
+define amdgpu_kernel void @local_address_gep(ptr addrspace(1) %out, ptr addrspace(3) %in, i32 %offset) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset
-  %1 = load i32, i32 addrspace(3)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %0 = getelementptr i32, ptr addrspace(3) %in, i32 %offset
+  %1 = load i32, ptr addrspace(3) %0
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_address_gep_const_offset:
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
-define amdgpu_kernel void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_const_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1
-  %1 = load i32, i32 addrspace(3)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %0 = getelementptr i32, ptr addrspace(3) %in, i32 1
+  %1 = load i32, ptr addrspace(3) %0
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -48,11 +48,11 @@ entry:
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_read_b32 [[VPTR]]
-define amdgpu_kernel void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_large_const_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385
-  %1 = load i32, i32 addrspace(3)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %0 = getelementptr i32, ptr addrspace(3) %in, i32 16385
+  %1 = load i32, ptr addrspace(3) %0
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -62,10 +62,10 @@ entry:
 ; GFX8: s_cmp_lg_u32
 ; GFX8-NOT: v_cmp_ne_u32
 ; GFX8: s_cselect_b32
-define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
-  %cmp = icmp ne i32 addrspace(3)* %lds, null
+define amdgpu_kernel void @null_32bit_lds_ptr(ptr addrspace(1) %out, ptr addrspace(3) %lds) nounwind {
+  %cmp = icmp ne ptr addrspace(3) %lds, null
   %x = select i1 %cmp, i32 123, i32 456
-  store i32 %x, i32 addrspace(1)* %out
+  store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
@@ -73,10 +73,10 @@ define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrsp
 ; SI: s_mul_i32
 ; SI-NEXT: s_add_i32
 ; SI: ds_read_b32
-define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
-  %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
-  %val = load float, float addrspace(3)* %ptr
-  store float %val, float addrspace(1)* %out
+define amdgpu_kernel void @mul_32bit_ptr(ptr addrspace(1) %out, ptr addrspace(3) %lds, i32 %tid) {
+  %ptr = getelementptr [3 x float], ptr addrspace(3) %lds, i32 %tid, i32 0
+  %val = load float, ptr addrspace(3) %ptr
+  store float %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -85,27 +85,27 @@ define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] a
 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
 ; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
 ; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
-define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
-  %val = load float, float addrspace(3)* @g_lds
-  store float %val, float addrspace(1)* %out
+define amdgpu_kernel void @infer_ptr_alignment_global_offset(ptr addrspace(1) %out, i32 %tid) {
+  %val = load float, ptr addrspace(3) @g_lds
+  store float %val, ptr addrspace(1) %out
   ret void
 }
 
 
-@ptr = addrspace(3) global i32 addrspace(3)* undef
+@ptr = addrspace(3) global ptr addrspace(3) undef
 @dst = addrspace(3) global [16383 x i32] undef
 
 ; FUNC-LABEL: {{^}}global_ptr:
 ; SI: ds_write_b32
 define amdgpu_kernel void @global_ptr() nounwind {
-  store i32 addrspace(3)* getelementptr ([16383 x i32], [16383 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
+  store ptr addrspace(3) getelementptr ([16383 x i32], ptr addrspace(3) @dst, i32 0, i32 16), ptr addrspace(3) @ptr
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_address_store:
 ; SI: ds_write_b32
-define amdgpu_kernel void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
-  store i32 %val, i32 addrspace(3)* %out
+define amdgpu_kernel void @local_address_store(ptr addrspace(3) %out, i32 %val) {
+  store i32 %val, ptr addrspace(3) %out
   ret void
 }
 
@@ -113,9 +113,9 @@ define amdgpu_kernel void @local_address_store(i32 addrspace(3)* %out, i32 %val)
 ; SI: s_add_i32 [[SADDR:s[0-9]+]],
 ; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
 ; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
-define amdgpu_kernel void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
-  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset
-  store i32 %val, i32 addrspace(3)* %gep, align 4
+define amdgpu_kernel void @local_address_gep_store(ptr addrspace(3) %out, i32, i32 %val, i32 %offset) {
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 %offset
+  store i32 %val, ptr addrspace(3) %gep, align 4
   ret void
 }
 
@@ -123,9 +123,9 @@ define amdgpu_kernel void @local_address_gep_store(i32 addrspace(3)* %out, i32,
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
 ; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
-define amdgpu_kernel void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
-  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1
-  store i32 %val, i32 addrspace(3)* %gep, align 4
+define amdgpu_kernel void @local_address_gep_const_offset_store(ptr addrspace(3) %out, i32 %val) {
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 1
+  store i32 %val, ptr addrspace(3) %gep, align 4
   ret void
 }
 
@@ -134,8 +134,8 @@ define amdgpu_kernel void @local_address_gep_const_offset_store(i32 addrspace(3)
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
-define amdgpu_kernel void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
-  %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385
-  store i32 %val, i32 addrspace(3)* %gep, align 4
+define amdgpu_kernel void @local_address_gep_large_const_offset_store(ptr addrspace(3) %out, i32 %val) {
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 16385
+  store i32 %val, ptr addrspace(3) %gep, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll
index 552e72ef30b51..40aec00a8b7d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll
@@ -109,7 +109,7 @@ define amdgpu_ps { i32, i32 } @sgpr_struct_return_i32_i32(i32 %vgpr0, i32 %vgpr1
   ret { i32, i32 } %value
 }
 
-define amdgpu_ps i8 addrspace(3)* @sgpr_return_p3i8(i8 addrspace(3)* %vgpr) {
+define amdgpu_ps ptr addrspace(3) @sgpr_return_p3i8(ptr addrspace(3) %vgpr) {
   ; CHECK-LABEL: name: sgpr_return_p3i8
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $vgpr0
@@ -119,10 +119,10 @@ define amdgpu_ps i8 addrspace(3)* @sgpr_return_p3i8(i8 addrspace(3)* %vgpr) {
   ; CHECK-NEXT:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[PTRTOINT]](s32)
   ; CHECK-NEXT:   $sgpr0 = COPY [[INT]](s32)
   ; CHECK-NEXT:   SI_RETURN_TO_EPILOG implicit $sgpr0
-  ret i8 addrspace(3)* %vgpr
+  ret ptr addrspace(3) %vgpr
 }
 
-define amdgpu_ps i8 addrspace(1)* @sgpr_return_p1i8(i8 addrspace(1)* %vgpr) {
+define amdgpu_ps ptr addrspace(1) @sgpr_return_p1i8(ptr addrspace(1) %vgpr) {
   ; CHECK-LABEL: name: sgpr_return_p1i8
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
@@ -136,7 +136,7 @@ define amdgpu_ps i8 addrspace(1)* @sgpr_return_p1i8(i8 addrspace(1)* %vgpr) {
   ; CHECK-NEXT:   [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
   ; CHECK-NEXT:   $sgpr1 = COPY [[INT1]](s32)
   ; CHECK-NEXT:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
-  ret i8 addrspace(1)* %vgpr
+  ret ptr addrspace(1) %vgpr
 }
 
 define amdgpu_ps <2 x i16> @sgpr_return_v2i16(<2 x i16> %vgpr) {

diff --git a/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll b/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
index 2079613d069ad..d41e6a4b946c6 100644
--- a/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
+++ b/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
@@ -4,10 +4,10 @@
 ; CHECK-NEXT: s_nop 0
 ; CHECK-NEXT: ;;#ASMEND
 
-define void @foo(i32 addrspace(5)* %ptr) #0 {
+define void @foo(ptr addrspace(5) %ptr) #0 {
   %tmp = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "s_nop 0", "=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65"(i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2)
   %tmp2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %tmp, 0
-  store i32 %tmp2, i32 addrspace(5)* %ptr, align 4
+  store i32 %tmp2, ptr addrspace(5) %ptr, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/add-debug.ll b/llvm/test/CodeGen/AMDGPU/add-debug.ll
index b90c20b974823..a7a7269b23b9a 100644
--- a/llvm/test/CodeGen/AMDGPU/add-debug.ll
+++ b/llvm/test/CodeGen/AMDGPU/add-debug.ll
@@ -3,13 +3,13 @@
 ; REQUIRES: asserts
 
 ; Check that SelectionDAGDumper does not crash on int_SI_if.
-define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
 
 if:
-  %1 = load i64, i64 addrspace(1)* %in
+  %1 = load i64, ptr addrspace(1) %in
   br label %endif
 
 else:
@@ -18,7 +18,7 @@ else:
 
 endif:
   %3 = phi i64 [%1, %if], [%2, %else]
-  store i64 %3, i64 addrspace(1)* %out
+  store i64 %3, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/add.i16.ll b/llvm/test/CodeGen/AMDGPU/add.i16.ll
index e3b239d910806..4d9efe3ed2ddb 100644
--- a/llvm/test/CodeGen/AMDGPU/add.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.i16.ll
@@ -6,15 +6,15 @@
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
-  %b = load volatile i16, i16 addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
+  %b = load volatile i16, ptr addrspace(1) %gep.in1
   %add = add i16 %a, %b
-  store i16 %add, i16 addrspace(1)* %out
+  store i16 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -23,13 +23,13 @@ define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define amdgpu_kernel void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
   %add = add i16 %a, 123
-  store i16 %add, i16 addrspace(1)* %out
+  store i16 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -38,13 +38,13 @@ define amdgpu_kernel void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 a
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfcb3, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define amdgpu_kernel void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
   %add = add i16 %a, -845
-  store i16 %add, i16 addrspace(1)* %out
+  store i16 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -53,13 +53,13 @@ define amdgpu_kernel void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
   %add = add i16 %a, -1
-  store i16 %add, i16 addrspace(1)* %out
+  store i16 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -69,16 +69,16 @@ define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i1
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI-NEXT: buffer_store_dword [[ADD]]
-define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_zext_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
-  %b = load volatile i16, i16 addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
+  %b = load volatile i16, ptr addrspace(1) %gep.in1
   %add = add i16 %a, %b
   %ext = zext i16 %add to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -88,16 +88,16 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
 ; VI: buffer_store_dwordx2 v[[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_zext_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
-  %b = load volatile i16, i16 addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds i64, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
+  %b = load volatile i16, ptr addrspace(1) %gep.in1
   %add = add i16 %a, %b
   %ext = zext i16 %add to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -108,16 +108,16 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i1
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]],  [[A]], [[B]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
-define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_sext_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep.in0
-  %b = load i16, i16 addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, ptr addrspace(1) %in1, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep.in0
+  %b = load i16, ptr addrspace(1) %gep.in1
   %add = add i16 %a, %b
   %ext = sext i16 %add to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -129,16 +129,16 @@ define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i1
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_sext_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep.in0
-  %b = load i16, i16 addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds i64, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, ptr addrspace(1) %in1, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep.in0
+  %b = load i16, ptr addrspace(1) %gep.in1
   %add = add i16 %a, %b
   %ext = sext i16 %add to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 19d72274dc5dc..4b6891e7aa20d 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -8,24 +8,24 @@
 ; GCN: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
 ; GCN: v_mov_b32_e32 v[[V_REG:[0-9]+]], s[[REG]]
 ; GCN: buffer_store_{{dword|b32}} v[[V_REG]],
-define amdgpu_kernel void @s_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1)* %in
-  %b = load i32, i32 addrspace(1)* %b_ptr
+define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
   %result = add i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_add_v2i32:
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
-define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
-  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
+define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <2 x i32>, ptr addrspace(1) %in
+  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
   %result = add <2 x i32> %a, %b
-  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -34,12 +34,12 @@ define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> a
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
-define amdgpu_kernel void @s_add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
-  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
+define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <4 x i32>, ptr addrspace(1) %in
+  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
   %result = add <4 x i32> %a, %b
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -52,10 +52,10 @@ define amdgpu_kernel void @s_add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> a
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
-define amdgpu_kernel void @s_add_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
+define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) {
 entry:
   %0 = add <8 x i32> %a, %b
-  store <8 x i32> %0, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -76,10 +76,10 @@ entry:
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
-define amdgpu_kernel void @s_add_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
+define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <16 x i32> %b) {
 entry:
   %0 = add <16 x i32> %a, %b
-  store <16 x i32> %0, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -89,14 +89,14 @@ entry:
 ; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, [[A]], [[B]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
 ; GFX10: v_add_nc_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
-define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
-  %a = load volatile i32, i32 addrspace(1)* %gep
-  %b = load volatile i32, i32 addrspace(1)* %b_ptr
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+  %b_ptr = getelementptr i32, ptr addrspace(1) %gep, i32 1
+  %a = load volatile i32, ptr addrspace(1) %gep
+  %b = load volatile i32, ptr addrspace(1) %b_ptr
   %result = add i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -105,23 +105,23 @@ define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
 ; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, 0x7b, [[A]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x7b, [[A]]
 ; GFX10: v_add_nc_u32_e32 v{{[0-9]+}}, 0x7b, [[A]]
-define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
-  %a = load volatile i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+  %b_ptr = getelementptr i32, ptr addrspace(1) %gep, i32 1
+  %a = load volatile i32, ptr addrspace(1) %gep
   %result = add i32 %a, 123
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}add64:
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
-define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 entry:
   %add = add i64 %a, %b
-  store i64 %add, i64 addrspace(1)* %out
+  store i64 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -132,11 +132,11 @@ entry:
 
 ; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
 ; GCN-NOT: v_addc_u32_e32 s
-define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr addrspace(1) %in) {
 entry:
-  %0 = load i64, i64 addrspace(1)* %in
+  %0 = load i64, ptr addrspace(1) %in
   %1 = add i64 %a, %0
-  store i64 %1, i64 addrspace(1)* %out
+  store i64 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -144,13 +144,13 @@ entry:
 ; FUNC-LABEL: {{^}}add64_in_branch:
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
-define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
 
 if:
-  %1 = load i64, i64 addrspace(1)* %in
+  %1 = load i64, ptr addrspace(1) %in
   br label %endif
 
 else:
@@ -159,7 +159,7 @@ else:
 
 endif:
   %3 = phi i64 [%1, %if], [%2, %else]
-  store i64 %3, i64 addrspace(1)* %out
+  store i64 %3, ptr addrspace(1) %out
   ret void
 }
 
@@ -178,7 +178,7 @@ endif:
 define amdgpu_ps void @add_select_vop3(i32 inreg %s, i32 %v) {
   %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
   %sub = add i32 %v, %s
-  store i32 %sub, i32 addrspace(3)* undef
+  store i32 %sub, ptr addrspace(3) undef
   call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 7cdaa80574943..0de05c6caddbc 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -6,7 +6,7 @@
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; FIXME: VI or should be unnecessary
-define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
 ; VI-LABEL: v_test_add_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -79,17 +79,17 @@ define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
-  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
+  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
   %add = add <2 x i16> %a, %b
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
+define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
 ; VI-LABEL: s_test_add_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -153,14 +153,14 @@ define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
-  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
+  %a = load <2 x i16>, ptr addrspace(4) %in0
+  %b = load <2 x i16>, ptr addrspace(4) %in1
   %add = add <2 x i16> %a, %b
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
+define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
 ; VI-LABEL: s_test_add_self_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -213,14 +213,14 @@ define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
+  %a = load <2 x i16>, ptr addrspace(4) %in0
   %add = add <2 x i16> %a, %a
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: VI should not scalarize arg access.
-define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
 ; VI-LABEL: s_test_add_v2i16_kernarg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -267,12 +267,12 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = add <2 x i16> %a, %b
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: Eliminate or with sdwa
-define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; VI-LABEL: v_test_add_v2i16_constant:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -329,16 +329,16 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %ou
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
   %add = add <2 x i16> %a, <i16 123, i16 456>
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; VI-LABEL: v_test_add_v2i16_neg_constant:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -395,15 +395,15 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)*
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
   %add = add <2 x i16> %a, <i16 -845, i16 -991>
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; VI-LABEL: v_test_add_v2i16_inline_neg1:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -459,15 +459,15 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)*
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
   %add = add <2 x i16> %a, <i16 -1, i16 -1>
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -522,16 +522,16 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
   %add = add <2 x i16> %a, <i16 32, i16 0>
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
 ; The high element gives fp
-define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; VI-LABEL: v_test_add_v2i16_inline_fp_split:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -587,16 +587,16 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
   %add = add <2 x i16> %a, <i16 0, i16 16256>
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
 ; VI-LABEL: v_test_add_v2i16_zext_to_v2i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -675,19 +675,19 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
-  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
+  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
   %add = add <2 x i16> %a, %b
   %ext = zext <2 x i16> %add to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
 ; VI-LABEL: v_test_add_v2i16_zext_to_v2i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -770,19 +770,19 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
-  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
+  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
   %add = add <2 x i16> %a, %b
   %ext = zext <2 x i16> %add to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
 ; VI-LABEL: v_test_add_v2i16_sext_to_v2i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -863,19 +863,19 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
-  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
+  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
   %add = add <2 x i16> %a, %b
   %ext = sext <2 x i16> %add to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
 ; VI-LABEL: v_test_add_v2i16_sext_to_v2i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -966,14 +966,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
+  %a = load <2 x i16>, ptr addrspace(1) %gep.in0
+  %b = load <2 x i16>, ptr addrspace(1) %gep.in1
   %add = add <2 x i16> %a, %b
   %ext = sext <2 x i16> %add to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/add_i1.ll b/llvm/test/CodeGen/AMDGPU/add_i1.ll
index 7910490d31c3d..b2cdc12fb11d5 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i1.ll
@@ -5,21 +5,21 @@
 ; GCN-LABEL: {{^}}add_var_var_i1:
 ; GFX9:  s_xor_b64
 ; GFX10: s_xor_b32
-define amdgpu_kernel void @add_var_var_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
-  %a = load volatile i1, i1 addrspace(1)* %in0
-  %b = load volatile i1, i1 addrspace(1)* %in1
+define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load volatile i1, ptr addrspace(1) %in0
+  %b = load volatile i1, ptr addrspace(1) %in1
   %add = add i1 %a, %b
-  store i1 %add, i1 addrspace(1)* %out
+  store i1 %add, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_var_imm_i1:
 ; GFX9:  s_not_b64
 ; GFX10: s_not_b32
-define amdgpu_kernel void @add_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) {
-  %a = load volatile i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @add_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %a = load volatile i1, ptr addrspace(1) %in
   %add = add i1 %a, 1
-  store i1 %add, i1 addrspace(1)* %out
+  store i1 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -27,24 +27,24 @@ define amdgpu_kernel void @add_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)
 ; GCN: ; %endif
 ; GFX9: s_not_b64
 ; GFX10: s_not_b32
-define amdgpu_kernel void @add_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
+define amdgpu_kernel void @add_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %d_cmp = icmp ult i32 %tid, 16
   br i1 %d_cmp, label %if, label %else
 
 if:
-  %0 = load volatile i1, i1 addrspace(1)* %a
+  %0 = load volatile i1, ptr addrspace(1) %a
   br label %endif
 
 else:
-  %1 = load volatile i1, i1 addrspace(1)* %b
+  %1 = load volatile i1, ptr addrspace(1) %b
   br label %endif
 
 endif:
   %2 = phi i1 [%0, %if], [%1, %else]
   %3 = add i1 %2, -1
-  store i1 %3, i1 addrspace(1)* %out
+  store i1 %3, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/add_i128.ll b/llvm/test/CodeGen/AMDGPU/add_i128.ll
index aa36095389bd5..d292002f58623 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i128.ll
@@ -6,14 +6,14 @@
 ; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 ; GCN-NEXT: v_addc_u32_e32 v[[HI:[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI]]],
-define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_i128_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
-  %a_ptr = getelementptr i128, i128 addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr i128, i128 addrspace(1)* %inB, i32 %tid
-  %a = load i128, i128 addrspace(1)* %a_ptr
-  %b = load i128, i128 addrspace(1)* %b_ptr
+  %a_ptr = getelementptr i128, ptr addrspace(1) %inA, i32 %tid
+  %b_ptr = getelementptr i128, ptr addrspace(1) %inB, i32 %tid
+  %a = load i128, ptr addrspace(1) %a_ptr
+  %b = load i128, ptr addrspace(1) %b_ptr
   %result = add i128 %a, %b
-  store i128 %result, i128 addrspace(1)* %out
+  store i128 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -23,10 +23,10 @@ define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128
 ; GCN: s_addc_u32
 ; GCN: s_addc_u32
 ; GCN: s_addc_u32
-define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
-  %foo = load i128, i128 addrspace(1)* %in, align 8
+define amdgpu_kernel void @sgpr_operand(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i128 %a) {
+  %foo = load i128, ptr addrspace(1) %in, align 8
   %result = add i128 %foo, %a
-  store i128 %result, i128 addrspace(1)* %out
+  store i128 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -35,10 +35,10 @@ define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 ad
 ; GCN: s_addc_u32
 ; GCN: s_addc_u32
 ; GCN: s_addc_u32
-define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
-  %foo = load i128, i128 addrspace(1)* %in, align 8
+define amdgpu_kernel void @sgpr_operand_reversed(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i128 %a) {
+  %foo = load i128, ptr addrspace(1) %in, align 8
   %result = add i128 %a, %foo
-  store i128 %result, i128 addrspace(1)* %out
+  store i128 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -47,9 +47,9 @@ define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out
 ; GCN: s_addc_u32
 ; GCN: s_addc_u32
 ; GCN: s_addc_u32
-define amdgpu_kernel void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
+define amdgpu_kernel void @test_sreg(ptr addrspace(1) noalias %out, i128 %a, i128 %b) {
   %result = add i128 %a, %b
-  store i128 %result, i128 addrspace(1)* %out
+  store i128 %result, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/add_i64.ll b/llvm/test/CodeGen/AMDGPU/add_i64.ll
index 894e9c7578cdb..6ad2cd86a0ebc 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i64.ll
@@ -6,14 +6,14 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone
 ; SI-LABEL: {{^}}test_i64_vreg:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_i64_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
-  %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
-  %a = load i64, i64 addrspace(1)* %a_ptr
-  %b = load i64, i64 addrspace(1)* %b_ptr
+  %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid
+  %b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid
+  %a = load i64, ptr addrspace(1) %a_ptr
+  %b = load i64, ptr addrspace(1) %b_ptr
   %result = add i64 %a, %b
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -21,10 +21,10 @@ define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 add
 ; SI-LABEL: {{^}}sgpr_operand:
 ; SI: s_add_u32
 ; SI: s_addc_u32
-define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
-  %foo = load i64, i64 addrspace(1)* %in, align 8
+define amdgpu_kernel void @sgpr_operand(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in_bar, i64 %a) {
+  %foo = load i64, ptr addrspace(1) %in, align 8
   %result = add i64 %foo, %a
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -34,10 +34,10 @@ define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addr
 ; SI-LABEL: {{^}}sgpr_operand_reversed:
 ; SI: s_add_u32
 ; SI: s_addc_u32
-define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
-  %foo = load i64, i64 addrspace(1)* %in, align 8
+define amdgpu_kernel void @sgpr_operand_reversed(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %a) {
+  %foo = load i64, ptr addrspace(1) %in, align 8
   %result = add i64 %a, %foo
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -47,9 +47,9 @@ define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out,
 ; SI: s_addc_u32
 ; SI: s_add_u32
 ; SI: s_addc_u32
-define amdgpu_kernel void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
+define amdgpu_kernel void @test_v2i64_sreg(ptr addrspace(1) noalias %out, <2 x i64> %a, <2 x i64> %b) {
   %result = add <2 x i64> %a, %b
-  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -58,14 +58,14 @@ define amdgpu_kernel void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out,
 ; SI: v_addc_u32
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
-  %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
-  %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
-  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
+  %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid
+  %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid
+  %a = load <2 x i64>, ptr addrspace(1) %a_ptr
+  %b = load <2 x i64>, ptr addrspace(1) %b_ptr
   %result = add <2 x i64> %a, %b
-  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -76,9 +76,9 @@ define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out,
 ; SI-NOT: addc
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
-define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i32, i64 %a, i32, i64 %b) {
+define amdgpu_kernel void @trunc_i64_add_to_i32(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b) {
   %add = add i64 %b, %a
   %trunc = trunc i64 %add to i32
-  store i32 %trunc, i32 addrspace(1)* %out, align 8
+  store i32 %trunc, ptr addrspace(1) %out, align 8
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll b/llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll
index 7776c66cbe79f..f838b853e64fb 100644
--- a/llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll
@@ -12,7 +12,7 @@ main_body:
   %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
   %tmp4 = extractelement <4 x float> %tmp3, i32 0
-  store volatile float %tmp4, float addrspace(1)* undef
+  store volatile float %tmp4, ptr addrspace(1) undef
   ret void
 }
 
@@ -28,7 +28,7 @@ main_body:
   %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
   %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
   %tmp4 = extractelement <4 x float> %tmp3, i32 1
-  store volatile float %tmp4, float addrspace(1)* undef
+  store volatile float %tmp4, ptr addrspace(1) undef
   ret void
 }
 
@@ -44,7 +44,7 @@ main_body:
   %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
   %tmp4 = extractelement <4 x float> %tmp3, i32 0
-  store volatile float %tmp4, float addrspace(1)* undef
+  store volatile float %tmp4, ptr addrspace(1) undef
   ret void
 }
 
@@ -60,7 +60,7 @@ main_body:
   %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
   %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
   %tmp4 = extractelement <4 x float> %tmp3, i32 1
-  store volatile float %tmp4, float addrspace(1)* undef
+  store volatile float %tmp4, ptr addrspace(1) undef
   ret void
 }
 
@@ -71,7 +71,7 @@ main_body:
   %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
   %tmp4 = extractelement <4 x float> %tmp3, i32 0
-  store volatile float %tmp4, float addrspace(1)* undef
+  store volatile float %tmp4, ptr addrspace(1) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 533e85e42ec86..a7f4bfe64373a 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -250,7 +250,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
 }
 
 ; Check that we do make use of v32 if there are no AGPRs present in the function
-define amdgpu_kernel void @no_agpr_no_reserve(<32 x i32> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 {
 ; GFX908-LABEL: no_agpr_no_reserve:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -367,10 +367,10 @@ define amdgpu_kernel void @no_agpr_no_reserve(<32 x i32> addrspace(1)* %arg) #0
 ; GFX90A-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
 ; GFX90A-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x i32>, <32 x i32> addrspace(1)* %arg, i32 %id
-  %load = load <32 x i32>, <32 x i32> addrspace(1)* %gep
+  %gep = getelementptr inbounds <32 x i32>, ptr addrspace(1) %arg, i32 %id
+  %load = load <32 x i32>, ptr addrspace(1) %gep
   %add = add <32 x i32> %load, %load
-  store <32 x i32> %add, <32 x i32> addrspace(1)* %gep
+  store <32 x i32> %add, ptr addrspace(1) %gep
   ret void
 }
 
@@ -775,7 +775,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_pk_add_f32 v[12:13], v[12:13], v[14:15]
 ; GFX90A-NEXT:    s_branch .LBB3_4
 bb:
-  %i = load volatile i16, i16 addrspace(4)* undef, align 2
+  %i = load volatile i16, ptr addrspace(4) undef, align 2
   %i6 = zext i16 %i to i64
   %i7 = udiv i32 %arg1, %arg2
   %i8 = zext i32 %i7 to i64
@@ -791,7 +791,7 @@ bb12:                                             ; preds = %bb58, %bb9
 
 bb14:                                             ; preds = %bb9
   %i11 = icmp slt i64 %i10, 0
-  %i15 = load i64, i64 addrspace(1)* null, align 8
+  %i15 = load i64, ptr addrspace(1) null, align 8
   br label %bb16
 
 bb16:                                             ; preds = %bb58, %bb14
@@ -803,20 +803,16 @@ bb16:                                             ; preds = %bb58, %bb14
   %i22 = add nsw i64 %i17, 1
   %i23 = mul nsw i64 %i22, %arg
   %i24 = add nsw i64 %i23, %i10
-  %i25 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 8
-  %i26 = bitcast half addrspace(1)* %i25 to <2 x half> addrspace(1)*
-  %i27 = load volatile <2 x half>, <2 x half> addrspace(1)* %i26, align 16
-  %i28 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 10
-  %i29 = bitcast half addrspace(1)* %i28 to <2 x half> addrspace(1)*
-  %i30 = load volatile <2 x half>, <2 x half> addrspace(1)* %i29, align 4
-  %i31 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 12
-  %i32 = bitcast half addrspace(1)* %i31 to <2 x half> addrspace(1)*
-  %i33 = load volatile <2 x half>, <2 x half> addrspace(1)* %i32, align 8
-  %i34 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 14
-  %i35 = bitcast half addrspace(1)* %i34 to <2 x half> addrspace(1)*
-  %i36 = load volatile <2 x half>, <2 x half> addrspace(1)* %i35, align 4
-  %i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8
-  %i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32
+  %i25 = getelementptr inbounds [16 x half], ptr addrspace(1) null, i64 %i24, i64 8
+  %i27 = load volatile <2 x half>, ptr addrspace(1) %i25, align 16
+  %i28 = getelementptr inbounds [16 x half], ptr addrspace(1) null, i64 %i24, i64 10
+  %i30 = load volatile <2 x half>, ptr addrspace(1) %i28, align 4
+  %i31 = getelementptr inbounds [16 x half], ptr addrspace(1) null, i64 %i24, i64 12
+  %i33 = load volatile <2 x half>, ptr addrspace(1) %i31, align 8
+  %i34 = getelementptr inbounds [16 x half], ptr addrspace(1) null, i64 %i24, i64 14
+  %i36 = load volatile <2 x half>, ptr addrspace(1) %i34, align 4
+  %i43 = load volatile <2 x float>, ptr addrspace(3) null, align 8
+  %i46 = load volatile <2 x float>, ptr addrspace(3) undef, align 32
   fence syncscope("workgroup") acquire
   br i1 %i11, label %bb58, label %bb51
 

diff --git a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
index 8bb9d95920a75..289260d9fad60 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
@@ -94,7 +94,7 @@ define amdgpu_kernel void @test_call_empty() #0 {
 bb:
   %reg = call <32 x float> asm sideeffect "; def $0", "=a"()
   call void @func_empty()
-  store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %reg, ptr addrspace(1) undef
   ret void
 }
 
@@ -116,7 +116,7 @@ define amdgpu_kernel void @test_call_areg4() #0 {
 bb:
   %reg = call <32 x float> asm sideeffect "; def $0", "=a"()
   call void @func_areg_4()
-  store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %reg, ptr addrspace(1) undef
   ret void
 }
 
@@ -138,7 +138,7 @@ define amdgpu_kernel void @test_call_areg32() #0 {
 bb:
   %reg = call <32 x float> asm sideeffect "; def $0", "=a"()
   call void @func_areg_32()
-  store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %reg, ptr addrspace(1) undef
   ret void
 }
 
@@ -159,7 +159,7 @@ define amdgpu_kernel void @test_call_areg64() #0 {
 bb:
   %reg = call <32 x float> asm sideeffect "; def $0", "=a"()
   call void @func_areg_64()
-  store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %reg, ptr addrspace(1) undef
   ret void
 }
 
@@ -181,7 +181,7 @@ define amdgpu_kernel void @test_call_areg31_63() #0 {
 bb:
   %reg = call <32 x float> asm sideeffect "; def $0", "=a"()
   call void @func_areg_31_63()
-  store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %reg, ptr addrspace(1) undef
   ret void
 }
 
@@ -203,7 +203,7 @@ define amdgpu_kernel void @test_call_unknown() #0 {
 bb:
   %reg = call <32 x float> asm sideeffect "; def $0", "=a"()
   call void @func_unknown()
-  store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %reg, ptr addrspace(1) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
index 0851af20e29dc..d42fd6366c0ba 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
@@ -3,7 +3,7 @@
 
 ; Make sure there are no v_accvgpr_read_b32 copying back and forth
 ; between AGPR and VGPR.
-define amdgpu_kernel void @remat_constant_voids_spill(i32 addrspace(1)* %p) #1 {
+define amdgpu_kernel void @remat_constant_voids_spill(ptr addrspace(1) %p) #1 {
 ; GFX908-LABEL: remat_constant_voids_spill:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    v_accvgpr_write_b32 a1, 1

diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
index fced8d0bbb392..51398ce61e9f1 100644
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -9,13 +9,13 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
 ; GCN: 	s_add_u32 s[[LOAD_ADDR:[0-9]+]], s[[IN_ADDR]], s[[SCALAR]]
 ; GCN:	s_load_dword s{{[0-9]+}}, s[[[LOAD_ADDR]]
 
-define amdgpu_kernel void @readfirstlane_uniform(float addrspace(1)* noalias nocapture readonly, float addrspace(1)* noalias nocapture readonly) {
+define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapture readonly, ptr addrspace(1) noalias nocapture readonly) {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %scalar = tail call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
   %idx = zext i32 %scalar to i64
-  %gep0 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idx
-  %val = load float, float addrspace(1)* %gep0, align 4
-  %gep1 = getelementptr inbounds float, float addrspace(1)* %1, i64 10
-  store float %val, float addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr inbounds float, ptr addrspace(1) %0, i64 %idx
+  %val = load float, ptr addrspace(1) %gep0, align 4
+  %gep1 = getelementptr inbounds float, ptr addrspace(1) %1, i64 10
+  store float %val, ptr addrspace(1) %gep1, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
index 73762206ad074..51b273b909f6d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
@@ -8,10 +8,10 @@
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
 ; GCN-NOT: v_mul_f32
 define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
-  %val0 = load volatile float, float addrspace(1)* undef
-  %val1 = load volatile float, float addrspace(1)* undef
+  %val0 = load volatile float, ptr addrspace(1) undef
+  %val1 = load volatile float, ptr addrspace(1) undef
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
-  store volatile float %min, float addrspace(1)* undef
+  store volatile float %min, ptr addrspace(1) undef
   ret void
 }
 
@@ -23,10 +23,10 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
 ; GCN-NOT: v_mul_f32
 define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
-  %val0 = load volatile float, float addrspace(1)* undef
-  %val1 = load volatile float, float addrspace(1)* undef
+  %val0 = load volatile float, ptr addrspace(1) undef
+  %val1 = load volatile float, ptr addrspace(1) undef
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
-  store volatile float %min, float addrspace(1)* undef
+  store volatile float %min, ptr addrspace(1) undef
   ret void
 }
 
@@ -38,10 +38,10 @@ define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
 ; GCN-NOT: v_mul_f32
 define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
-  %val0 = load volatile float, float addrspace(1)* undef
-  %val1 = load volatile float, float addrspace(1)* undef
+  %val0 = load volatile float, ptr addrspace(1) undef
+  %val1 = load volatile float, ptr addrspace(1) undef
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
-  store volatile float %min, float addrspace(1)* undef
+  store volatile float %min, ptr addrspace(1) undef
   ret void
 }
 
@@ -53,10 +53,10 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
 ; GCN-NOT: v_mul_f32
 define void @func_ieee_mode_default() #0 {
-  %val0 = load volatile float, float addrspace(1)* undef
-  %val1 = load volatile float, float addrspace(1)* undef
+  %val0 = load volatile float, ptr addrspace(1) undef
+  %val1 = load volatile float, ptr addrspace(1) undef
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
-  store volatile float %min, float addrspace(1)* undef
+  store volatile float %min, ptr addrspace(1) undef
   ret void
 }
 
@@ -68,10 +68,10 @@ define void @func_ieee_mode_default() #0 {
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
 ; GCN-NOT: v_mul_f32
 define void @func_ieee_mode_on() #1 {
-  %val0 = load volatile float, float addrspace(1)* undef
-  %val1 = load volatile float, float addrspace(1)* undef
+  %val0 = load volatile float, ptr addrspace(1) undef
+  %val1 = load volatile float, ptr addrspace(1) undef
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
-  store volatile float %min, float addrspace(1)* undef
+  store volatile float %min, ptr addrspace(1) undef
   ret void
 }
 
@@ -83,10 +83,10 @@ define void @func_ieee_mode_on() #1 {
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
 ; GCN-NOT: v_mul_f32
 define void @func_ieee_mode_off() #2 {
-  %val0 = load volatile float, float addrspace(1)* undef
-  %val1 = load volatile float, float addrspace(1)* undef
+  %val0 = load volatile float, ptr addrspace(1) undef
+  %val1 = load volatile float, ptr addrspace(1) undef
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
-  store volatile float %min, float addrspace(1)* undef
+  store volatile float %min, ptr addrspace(1) undef
   ret void
 }
 
@@ -98,10 +98,10 @@ define void @func_ieee_mode_off() #2 {
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
 ; GCN-NOT: v_mul_f32
 define amdgpu_cs void @cs_ieee_mode_default() #0 {
-  %val0 = load volatile float, float addrspace(1)* undef
-  %val1 = load volatile float, float addrspace(1)* undef
+  %val0 = load volatile float, ptr addrspace(1) undef
+  %val1 = load volatile float, ptr addrspace(1) undef
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
-  store volatile float %min, float addrspace(1)* undef
+  store volatile float %min, ptr addrspace(1) undef
   ret void
 }
 
@@ -113,10 +113,10 @@ define amdgpu_cs void @cs_ieee_mode_default() #0 {
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
 ; GCN-NOT: v_mul_f32
 define amdgpu_cs void @cs_ieee_mode_on() #1 {
-  %val0 = load volatile float, float addrspace(1)* undef
-  %val1 = load volatile float, float addrspace(1)* undef
+  %val0 = load volatile float, ptr addrspace(1) undef
+  %val1 = load volatile float, ptr addrspace(1) undef
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
-  store volatile float %min, float addrspace(1)* undef
+  store volatile float %min, ptr addrspace(1) undef
   ret void
 }
 
@@ -128,10 +128,10 @@ define amdgpu_cs void @cs_ieee_mode_on() #1 {
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
 ; GCN-NOT: v_mul_f32
 define amdgpu_cs void @cs_ieee_mode_off() #2 {
-  %val0 = load volatile float, float addrspace(1)* undef
-  %val1 = load volatile float, float addrspace(1)* undef
+  %val0 = load volatile float, ptr addrspace(1) undef
+  %val1 = load volatile float, ptr addrspace(1) undef
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
-  store volatile float %min, float addrspace(1)* undef
+  store volatile float %min, ptr addrspace(1) undef
   ret void
 }
 
@@ -143,10 +143,10 @@ define amdgpu_cs void @cs_ieee_mode_off() #2 {
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
 ; GCN-NOT: v_mul_f32
 define amdgpu_ps void @ps_ieee_mode_default() #0 {
-  %val0 = load volatile float, float addrspace(1)* undef
-  %val1 = load volatile float, float addrspace(1)* undef
+  %val0 = load volatile float, ptr addrspace(1) undef
+  %val1 = load volatile float, ptr addrspace(1) undef
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
-  store volatile float %min, float addrspace(1)* undef
+  store volatile float %min, ptr addrspace(1) undef
   ret void
 }
 
@@ -158,10 +158,10 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 {
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
 ; GCN-NOT: v_mul_f32
 define amdgpu_ps void @ps_ieee_mode_on() #1 {
-  %val0 = load volatile float, float addrspace(1)* undef
-  %val1 = load volatile float, float addrspace(1)* undef
+  %val0 = load volatile float, ptr addrspace(1) undef
+  %val1 = load volatile float, ptr addrspace(1) undef
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
-  store volatile float %min, float addrspace(1)* undef
+  store volatile float %min, ptr addrspace(1) undef
   ret void
 }
 
@@ -173,10 +173,10 @@ define amdgpu_ps void @ps_ieee_mode_on() #1 {
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
 ; GCN-NOT: v_mul_f32
 define amdgpu_ps void @ps_ieee_mode_off() #2 {
-  %val0 = load volatile float, float addrspace(1)* undef
-  %val1 = load volatile float, float addrspace(1)* undef
+  %val0 = load volatile float, ptr addrspace(1) undef
+  %val1 = load volatile float, ptr addrspace(1) undef
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
-  store volatile float %min, float addrspace(1)* undef
+  store volatile float %min, ptr addrspace(1) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index 4ae7193fea4fa..757da88d8d108 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -5,9 +5,9 @@
 ; This test just checks that the compiler doesn't crash.
 
 ; FUNC-LABEL: {{^}}v32i8_to_v8i32:
-define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(4)* inreg) #0 {
+define amdgpu_ps float @v32i8_to_v8i32(ptr addrspace(4) inreg) #0 {
 entry:
-  %1 = load <32 x i8>, <32 x i8> addrspace(4)* %0
+  %1 = load <32 x i8>, ptr addrspace(4) %0
   %2 = bitcast <32 x i8> %1 to <8 x i32>
   %3 = extractelement <8 x i32> %2, i32 1
   %4 = icmp ne i32 %3, 0
@@ -17,87 +17,86 @@ entry:
 
 ; FUNC-LABEL: {{^}}i8ptr_v16i8ptr:
 ; SI: s_endpgm
-define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
-  %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0
-  store <16 x i8> %1, <16 x i8> addrspace(1)* %out
+  %0 = load <16 x i8>, ptr addrspace(1) %in
+  store <16 x i8> %0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
-  %load = load float, float addrspace(1)* %in, align 4
+define amdgpu_kernel void @f32_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load float, ptr addrspace(1) %in, align 4
   %fadd32 = fadd float %load, 1.0
   %bc = bitcast float %fadd32 to <2 x i16>
   %add.bitcast = add <2 x i16> %bc, <i16 2, i16 2>
-  store <2 x i16> %add.bitcast, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add.bitcast, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
-  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v2i16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <2 x i16>, ptr addrspace(1) %in, align 4
   %add.v2i16 = add <2 x i16> %load, <i16 2, i16 2>
   %bc = bitcast <2 x i16> %add.v2i16 to float
   %fadd.bitcast = fadd float %bc, 1.0
-  store float %fadd.bitcast, float addrspace(1)* %out
+  store float %fadd.bitcast, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
-  %load = load float, float addrspace(1)* %in, align 4
+define amdgpu_kernel void @f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load float, ptr addrspace(1) %in, align 4
   %fadd32 = fadd float %load, 1.0
   %bc = bitcast float %fadd32 to <2 x half>
   %add.bitcast = fadd <2 x half> %bc, <half 2.0, half 2.0>
-  store <2 x half> %add.bitcast, <2 x half> addrspace(1)* %out
+  store <2 x half> %add.bitcast, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
-  %load = load <2 x half>, <2 x half> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v2f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <2 x half>, ptr addrspace(1) %in, align 4
   %add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0>
   %bc = bitcast <2 x half> %add.v2f16 to float
   %fadd.bitcast = fadd float %bc, 1.0
-  store float %fadd.bitcast, float addrspace(1)* %out
+  store float %fadd.bitcast, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v4i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <4 x i8>, ptr addrspace(1) %in, align 4
   %bc = bitcast <4 x i8> %load to i32
-  store i32 %bc, i32 addrspace(1)* %out, align 4
+  store i32 %bc, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @i32_to_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i32, ptr addrspace(1) %in, align 4
   %bc = bitcast i32 %load to <4 x i8>
-  store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
+  store <4 x i8> %bc, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64:
 ; SI: s_endpgm
-define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
+define amdgpu_kernel void @bitcast_v2i32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %val = load <2 x i32>, ptr addrspace(1) %in, align 8
   %add = add <2 x i32> %val, <i32 4, i32 9>
   %bc = bitcast <2 x i32> %add to double
   %fadd.bc = fadd double %bc, 1.0
-  store double %fadd.bc, double addrspace(1)* %out, align 8
+  store double %fadd.bc, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
-  %val = load double, double addrspace(1)* %in, align 8
+define amdgpu_kernel void @bitcast_f64_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %val = load double, ptr addrspace(1) %in, align 8
   %add = fadd double %val, 4.0
   %bc = bitcast double %add to <2 x i32>
-  store <2 x i32> %bc, <2 x i32> addrspace(1)* %out, align 8
+  store <2 x i32> %bc, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
-define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
+define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, ptr addrspace(1) %out, <2 x i64> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
@@ -108,12 +107,12 @@ if:
 
 end:
   %phi = phi <2 x double> [zeroinitializer, %entry], [%cast, %if]
-  store <2 x double> %phi, <2 x double> addrspace(1)* %out
+  store <2 x double> %phi, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
-define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
+define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, ptr addrspace(1) %out, <2 x double> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
@@ -124,167 +123,167 @@ if:
 
 end:
   %phi = phi <2 x i64> [zeroinitializer, %entry], [%cast, %if]
-  store <2 x i64> %phi, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %phi, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v4i16_to_f64:
-define amdgpu_kernel void @v4i16_to_f64(double addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v4i16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <4 x i16>, ptr addrspace(1) %in, align 4
   %add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
   %bc = bitcast <4 x i16> %add.v4i16 to double
   %fadd.bitcast = fadd double %bc, 1.0
-  store double %fadd.bitcast, double addrspace(1)* %out
+  store double %fadd.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v4f16_to_f64:
-define amdgpu_kernel void @v4f16_to_f64(double addrspace(1)* %out, <4 x half> addrspace(1)* %in) nounwind {
-  %load = load <4 x half>, <4 x half> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v4f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <4 x half>, ptr addrspace(1) %in, align 4
   %add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
   %bc = bitcast <4 x half> %add.v4half to double
   %fadd.bitcast = fadd double %bc, 1.0
-  store double %fadd.bitcast, double addrspace(1)* %out
+  store double %fadd.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}f64_to_v4f16:
-define amdgpu_kernel void @f64_to_v4f16(<4 x half> addrspace(1)* %out, double addrspace(1)* %in) nounwind {
-  %load = load double, double addrspace(1)* %in, align 4
+define amdgpu_kernel void @f64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load double, ptr addrspace(1) %in, align 4
   %fadd32 = fadd double %load, 1.0
   %bc = bitcast double %fadd32 to <4 x half>
   %add.bitcast = fadd <4 x half> %bc, <half 2.0, half 2.0, half 2.0, half 2.0>
-  store <4 x half> %add.bitcast, <4 x half> addrspace(1)* %out
+  store <4 x half> %add.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}f64_to_v4i16:
-define amdgpu_kernel void @f64_to_v4i16(<4 x i16> addrspace(1)* %out, double addrspace(1)* %in) nounwind {
-  %load = load double, double addrspace(1)* %in, align 4
+define amdgpu_kernel void @f64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load double, ptr addrspace(1) %in, align 4
   %fadd32 = fadd double %load, 1.0
   %bc = bitcast double %fadd32 to <4 x i16>
   %add.bitcast = add <4 x i16> %bc, <i16 2, i16 2, i16 2, i16 2>
-  store <4 x i16> %add.bitcast, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %add.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v4i16_to_i64:
-define amdgpu_kernel void @v4i16_to_i64(i64 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v4i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <4 x i16>, ptr addrspace(1) %in, align 4
   %add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
   %bc = bitcast <4 x i16> %add.v4i16 to i64
   %add.bitcast = add i64 %bc, 1
-  store i64 %add.bitcast, i64 addrspace(1)* %out
+  store i64 %add.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v4f16_to_i64:
-define amdgpu_kernel void @v4f16_to_i64(i64 addrspace(1)* %out, <4 x half> addrspace(1)* %in) nounwind {
-  %load = load <4 x half>, <4 x half> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v4f16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <4 x half>, ptr addrspace(1) %in, align 4
   %add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
   %bc = bitcast <4 x half> %add.v4half to i64
   %add.bitcast = add i64 %bc, 1
-  store i64 %add.bitcast, i64 addrspace(1)* %out
+  store i64 %add.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}bitcast_i64_to_v4i16:
-define amdgpu_kernel void @bitcast_i64_to_v4i16(<4 x i16> addrspace(1)* %out, i64 addrspace(1)* %in) {
-  %val = load i64, i64 addrspace(1)* %in, align 8
+define amdgpu_kernel void @bitcast_i64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %val = load i64, ptr addrspace(1) %in, align 8
   %add = add i64 %val, 4
   %bc = bitcast i64 %add to <4 x i16>
   %add.v4i16 = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
-  store <4 x i16> %add.v4i16, <4 x i16> addrspace(1)* %out, align 8
+  store <4 x i16> %add.v4i16, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}bitcast_i64_to_v4f16:
-define amdgpu_kernel void @bitcast_i64_to_v4f16(<4 x half> addrspace(1)* %out, i64 addrspace(1)* %in) {
-  %val = load i64, i64 addrspace(1)* %in, align 8
+define amdgpu_kernel void @bitcast_i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %val = load i64, ptr addrspace(1) %in, align 8
   %add = add i64 %val, 4
   %bc = bitcast i64 %add to <4 x half>
   %add.v4i16 = fadd <4 x half> %bc, <half 1.0, half 2.0, half 4.0, half 8.0>
-  store <4 x half> %add.v4i16, <4 x half> addrspace(1)* %out, align 8
+  store <4 x half> %add.v4i16, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v4i16_to_v2f32:
-define amdgpu_kernel void @v4i16_to_v2f32(<2 x float> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v4i16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <4 x i16>, ptr addrspace(1) %in, align 4
   %add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
   %bc = bitcast <4 x i16> %add.v4i16 to <2 x float>
   %fadd.bitcast = fadd <2 x float> %bc, <float 1.0, float 1.0>
-  store <2 x float> %fadd.bitcast, <2 x float> addrspace(1)* %out
+  store <2 x float> %fadd.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v4f16_to_v2f32:
-define amdgpu_kernel void @v4f16_to_v2f32(<2 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) nounwind {
-  %load = load <4 x half>, <4 x half> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v4f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <4 x half>, ptr addrspace(1) %in, align 4
   %add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
   %bc = bitcast <4 x half> %add.v4half to <2 x float>
   %fadd.bitcast = fadd <2 x float> %bc, <float 1.0, float 1.0>
-  store <2 x float> %fadd.bitcast, <2 x float> addrspace(1)* %out
+  store <2 x float> %fadd.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v2f32_to_v4i16:
-define amdgpu_kernel void @v2f32_to_v4i16(<4 x i16> addrspace(1)* %out, <2 x float> addrspace(1)* %in) nounwind {
-  %load = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v2f32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <2 x float>, ptr addrspace(1) %in, align 4
   %add.v2f32 = fadd <2 x float> %load, <float 2.0, float 4.0>
   %bc = bitcast <2 x float> %add.v2f32 to <4 x i16>
   %add.bitcast = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
-  store <4 x i16> %add.bitcast, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %add.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v2f32_to_v4f16:
-define amdgpu_kernel void @v2f32_to_v4f16(<4 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) nounwind {
-  %load = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v2f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <2 x float>, ptr addrspace(1) %in, align 4
   %add.v2f32 = fadd <2 x float> %load, <float 2.0, float 4.0>
   %bc = bitcast <2 x float> %add.v2f32 to <4 x half>
   %add.bitcast = fadd <4 x half> %bc, <half 1.0, half 2.0, half 4.0, half 8.0>
-  store <4 x half> %add.bitcast, <4 x half> addrspace(1)* %out
+  store <4 x half> %add.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v4i16_to_v2i32:
-define amdgpu_kernel void @v4i16_to_v2i32(<2 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v4i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <4 x i16>, ptr addrspace(1) %in, align 4
   %add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
   %bc = bitcast <4 x i16> %add.v4i16 to <2 x i32>
   %add.bitcast = add <2 x i32> %bc, <i32 1, i32 1>
-  store <2 x i32> %add.bitcast, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %add.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v4f16_to_v2i32:
-define amdgpu_kernel void @v4f16_to_v2i32(<2 x i32> addrspace(1)* %out, <4 x half> addrspace(1)* %in) nounwind {
-  %load = load <4 x half>, <4 x half> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v4f16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <4 x half>, ptr addrspace(1) %in, align 4
   %add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
   %bc = bitcast <4 x half> %add.v4half to <2 x i32>
   %add.bitcast = add <2 x i32> %bc, <i32 1, i32 1>
-  store <2 x i32> %add.bitcast, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %add.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v2i32_to_v4i16:
-define amdgpu_kernel void @v2i32_to_v4i16(<4 x i16> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
-  %load = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v2i32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <2 x i32>, ptr addrspace(1) %in, align 4
   %add.v2i32 = add <2 x i32> %load, <i32 2, i32 4>
   %bc = bitcast <2 x i32> %add.v2i32 to <4 x i16>
   %add.bitcast = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
-  store <4 x i16> %add.bitcast, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %add.bitcast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v2i32_to_v4f16:
-define amdgpu_kernel void @v2i32_to_v4f16(<4 x half> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
-  %load = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 4
+define amdgpu_kernel void @v2i32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load <2 x i32>, ptr addrspace(1) %in, align 4
   %add.v2i32 = add <2 x i32> %load, <i32 2, i32 4>
   %bc = bitcast <2 x i32> %add.v2i32 to <4 x half>
   %add.bitcast = fadd <4 x half> %bc, <half 1.0, half 2.0, half 4.0, half 8.0>
-  store <4 x half> %add.bitcast, <4 x half> addrspace(1)* %out
+  store <4 x half> %add.bitcast, ptr addrspace(1) %out
   ret void
 }
 
@@ -302,17 +301,17 @@ define <2 x i64> @bitcast_v4f32_to_v2i64(<2 x i64> %arg) {
 declare half @llvm.canonicalize.f16(half)
 
 ; FUNC-LABEL: {{^}}bitcast_f32_to_v1i32:
-define amdgpu_kernel void @bitcast_f32_to_v1i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @bitcast_f32_to_v1i32(ptr addrspace(1) %out) {
   %f16 = call arcp afn half @llvm.canonicalize.f16(half 0xH03F0)
   %f32 = fpext half %f16 to float
   %v = bitcast float %f32 to <1 x i32>
   %v1 = extractelement <1 x i32> %v, i32 0
-  store i32 %v1, i32 addrspace(1)* %out
+  store i32 %v1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v4i64_to_v16i16:
-define amdgpu_kernel void @bitcast_v4i64_to_v16i16(i32 %cond, <16 x i16> addrspace(1)* %out, <4 x i64> %value) {
+define amdgpu_kernel void @bitcast_v4i64_to_v16i16(i32 %cond, ptr addrspace(1) %out, <4 x i64> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
@@ -325,12 +324,12 @@ if:
 
 end:
   %phi_cast = phi <16 x i16> [zeroinitializer, %entry], [%cast, %if]
-  store <16 x i16> %phi_cast, <16 x i16> addrspace(1)* %out
+  store <16 x i16> %phi_cast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v4f64_to_v16f16:
-define amdgpu_kernel void @bitcast_v4f64_to_v16f16(i32 %cond, <16 x half> addrspace(1)* %out, <4 x double> %value) {
+define amdgpu_kernel void @bitcast_v4f64_to_v16f16(i32 %cond, ptr addrspace(1) %out, <4 x double> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
@@ -343,12 +342,12 @@ if:
 
 end:
   %phi_cast = phi <16 x half> [zeroinitializer, %entry], [%cast, %if]
-  store <16 x half> %phi_cast, <16 x half> addrspace(1)* %out
+  store <16 x half> %phi_cast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v16i16_to_v4i64:
-define amdgpu_kernel void @bitcast_v16i16_to_v4i64(i32 %cond, <4 x i64> addrspace(1)* %out, <16 x i16> %value) {
+define amdgpu_kernel void @bitcast_v16i16_to_v4i64(i32 %cond, ptr addrspace(1) %out, <16 x i16> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
@@ -361,12 +360,12 @@ if:
 
 end:
   %phi_cast = phi <4 x i64> [zeroinitializer, %entry], [%cast, %if]
-  store <4 x i64> %phi_cast, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %phi_cast, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v16f16_to_v4f64:
-define amdgpu_kernel void @bitcast_v16f16_to_v4f64(i32 %cond, <4 x double> addrspace(1)* %out, <16 x half> %value) {
+define amdgpu_kernel void @bitcast_v16f16_to_v4f64(i32 %cond, ptr addrspace(1) %out, <16 x half> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
@@ -379,6 +378,6 @@ if:
 
 end:
   %phi_cast = phi <4 x double> [zeroinitializer, %entry], [%cast, %if]
-  store <4 x double> %phi_cast, <4 x double> addrspace(1)* %out
+  store <4 x double> %phi_cast, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index 1a9b966adec97..f9c98e85ed5cc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -4,9 +4,9 @@
 
 ; NOOP-LABEL: @noop_fdiv_fpmath(
 ; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
-define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
+define amdgpu_kernel void @noop_fdiv_fpmath(ptr addrspace(1) %out, float %a, float %b) #3 {
   %md.25ulp = fdiv float %a, %b, !fpmath !0
-  store volatile float %md.25ulp, float addrspace(1)* %out
+  store volatile float %md.25ulp, ptr addrspace(1) %out
   ret void
 }
 
@@ -20,27 +20,27 @@ define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a,
 ; CHECK: %fast.md.25ulp = fmul fast float %a, %[[FAST_RCP]]
 ; CHECK: %[[AFN_RCP:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %b)
 ; CHECK: afn.md.25ulp = fmul afn float %a, %[[AFN_RCP]]
-define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
+define amdgpu_kernel void @fdiv_fpmath(ptr addrspace(1) %out, float %a, float %b) #1 {
   %no.md = fdiv float %a, %b
-  store volatile float %no.md, float addrspace(1)* %out
+  store volatile float %no.md, ptr addrspace(1) %out
 
   %md.half.ulp = fdiv float %a, %b, !fpmath !1
-  store volatile float %md.half.ulp, float addrspace(1)* %out
+  store volatile float %md.half.ulp, ptr addrspace(1) %out
 
   %md.1ulp = fdiv float %a, %b, !fpmath !2
-  store volatile float %md.1ulp, float addrspace(1)* %out
+  store volatile float %md.1ulp, ptr addrspace(1) %out
 
   %md.25ulp = fdiv float %a, %b, !fpmath !0
-  store volatile float %md.25ulp, float addrspace(1)* %out
+  store volatile float %md.25ulp, ptr addrspace(1) %out
 
   %md.3ulp = fdiv float %a, %b, !fpmath !3
-  store volatile float %md.3ulp, float addrspace(1)* %out
+  store volatile float %md.3ulp, ptr addrspace(1) %out
 
   %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
-  store volatile float %fast.md.25ulp, float addrspace(1)* %out
+  store volatile float %fast.md.25ulp, ptr addrspace(1) %out
 
   %afn.md.25ulp = fdiv afn float %a, %b, !fpmath !0
-  store volatile float %afn.md.25ulp, float addrspace(1)* %out
+  store volatile float %afn.md.25ulp, ptr addrspace(1) %out
 
   ret void
 }
@@ -53,27 +53,27 @@ define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float
 ; CHECK: %afn.25ulp = call afn float @llvm.amdgcn.rcp.f32(float %x)
 ; CHECK: %fast.no.md = call fast float @llvm.amdgcn.rcp.f32(float %x)
 ; CHECK: %fast.25ulp = call fast float @llvm.amdgcn.rcp.f32(float %x)
-define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath(ptr addrspace(1) %out, float %x) #1 {
   %no.md = fdiv float 1.0, %x
-  store volatile float %no.md, float addrspace(1)* %out
+  store volatile float %no.md, ptr addrspace(1) %out
 
   %md.25ulp = fdiv float 1.0, %x, !fpmath !0
-  store volatile float %md.25ulp, float addrspace(1)* %out
+  store volatile float %md.25ulp, ptr addrspace(1) %out
 
   %md.half.ulp = fdiv float 1.0, %x, !fpmath !1
-  store volatile float %md.half.ulp, float addrspace(1)* %out
+  store volatile float %md.half.ulp, ptr addrspace(1) %out
 
   %afn.no.md = fdiv afn float 1.0, %x
-  store volatile float %afn.no.md, float addrspace(1)* %out
+  store volatile float %afn.no.md, ptr addrspace(1) %out
 
   %afn.25ulp = fdiv afn float 1.0, %x, !fpmath !0
-  store volatile float %afn.25ulp, float addrspace(1)* %out
+  store volatile float %afn.25ulp, ptr addrspace(1) %out
 
   %fast.no.md = fdiv fast float 1.0, %x
-  store volatile float %fast.no.md, float addrspace(1)* %out
+  store volatile float %fast.no.md, ptr addrspace(1) %out
 
   %fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0
-  store volatile float %fast.25ulp, float addrspace(1)* %out
+  store volatile float %fast.25ulp, ptr addrspace(1) %out
 
   ret void
 }
@@ -87,7 +87,7 @@ define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #
 ; CHECK: %[[NO_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
 ; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float %[[NO_A1]], %[[NO_B1]]
 ; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1
-; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+; CHECK: store volatile <2 x float> %no.md, ptr addrspace(1) %out
 
 ; CHECK: %[[HALF_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
 ; CHECK: %[[HALF_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
@@ -97,7 +97,7 @@ define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #
 ; CHECK: %[[HALF_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
 ; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float %[[HALF_A1]], %[[HALF_B1]]
 ; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1
-; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+; CHECK: store volatile <2 x float> %md.half.ulp, ptr addrspace(1) %out
 
 ; CHECK: %[[ONE_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
 ; CHECK: %[[ONE_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
@@ -107,7 +107,7 @@ define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #
 ; CHECK: %[[ONE_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
 ; CHECK: %[[ONE_FDIV1:[0-9]+]] = fdiv float %[[ONE_A1]], %[[ONE_B1]]
 ; CHECK: %md.1ulp = insertelement <2 x float> %[[ONE_INS0]], float %[[ONE_FDIV1]], i64 1
-; CHECK: store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out
+; CHECK: store volatile <2 x float> %md.1ulp, ptr addrspace(1) %out
 
 ; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
 ; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
@@ -117,18 +117,18 @@ define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #
 ; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
 ; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]])
 ; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
-define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
+define amdgpu_kernel void @fdiv_fpmath_vector(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #1 {
   %no.md = fdiv <2 x float> %a, %b
-  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %no.md, ptr addrspace(1) %out
 
   %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
-  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %md.half.ulp, ptr addrspace(1) %out
 
   %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
-  store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %md.1ulp, ptr addrspace(1) %out
 
   %md.25ulp = fdiv <2 x float> %a, %b, !fpmath !0
-  store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %md.25ulp, ptr addrspace(1) %out
 
   ret void
 }
@@ -140,7 +140,7 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
 ; CHECK: %[[NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
 ; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float 1.000000e+00, %[[NO1]]
 ; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1
-; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+; CHECK: store volatile <2 x float> %no.md, ptr addrspace(1) %out
 
 ; CHECK: %[[HALF0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF0]]
@@ -148,7 +148,7 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
 ; CHECK: %[[HALF1:[0-9]+]] = extractelement <2 x float> %x, i64 1
 ; CHECK: %[[HALF_FDIV1:[0-9]+]] =  fdiv float 1.000000e+00, %[[HALF1]]
 ; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1
-; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+; CHECK: store volatile <2 x float> %md.half.ulp, ptr addrspace(1) %out
 
 ; CHECK: %[[AFN_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[AFN_NO_FDIV0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO0]])
@@ -156,7 +156,7 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
 ; CHECK: %[[AFN_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
 ; CHECK: %[[AFN_NO_FDIV1:[0-9]+]] =  call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO1]])
 ; CHECK: %afn.no.md = insertelement <2 x float> %[[AFN_NO_INS0]], float %[[AFN_NO_FDIV1]], i64 1
-; CHECK: store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out
+; CHECK: store volatile <2 x float> %afn.no.md, ptr addrspace(1) %out
 
 ; CHECK: %[[FAST_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]])
@@ -164,7 +164,7 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
 ; CHECK: %[[FAST_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
 ; CHECK: %[[FAST_NO_RCP1:[0-9]+]] =  call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO1]])
 ; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_RCP1]], i64 1
-; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+; CHECK: store volatile <2 x float> %fast.no.md, ptr addrspace(1) %out
 
 ; CHECK: %[[AFN_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[AFN_25_RCP0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_250]])
@@ -172,7 +172,7 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
 ; CHECK: %[[AFN_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
 ; CHECK: %[[AFN_25_RCP1:[0-9]+]] =  call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_251]])
 ; CHECK: %afn.25ulp = insertelement <2 x float> %[[AFN_25_INS0]], float %[[AFN_25_RCP1]], i64 1
-; CHECK: store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
+; CHECK: store volatile <2 x float> %afn.25ulp, ptr addrspace(1) %out
 
 ; CHECK: %[[FAST_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]])
@@ -180,25 +180,25 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
 ; CHECK: %[[FAST_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
 ; CHECK: %[[FAST_25_RCP1:[0-9]+]] =  call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_251]])
 ; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_25_INS0]], float %[[FAST_25_RCP1]], i64 1
-; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
-define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+; CHECK: store volatile <2 x float> %fast.25ulp, ptr addrspace(1) %out
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector(ptr addrspace(1) %out, <2 x float> %x) #1 {
   %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
-  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %no.md, ptr addrspace(1) %out
 
   %md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1
-  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %md.half.ulp, ptr addrspace(1) %out
 
   %afn.no.md = fdiv afn <2 x float> <float 1.0, float 1.0>, %x
-  store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %afn.no.md, ptr addrspace(1) %out
 
   %fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
-  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %fast.no.md, ptr addrspace(1) %out
 
   %afn.25ulp = fdiv afn <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
-  store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %afn.25ulp, ptr addrspace(1) %out
 
   %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
-  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %fast.25ulp, ptr addrspace(1) %out
 
   ret void
 }
@@ -210,7 +210,7 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
 ; CHECK: %[[NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
 ; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float 2.000000e+00, %[[NO1]]
 ; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1
-; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+; CHECK: store volatile <2 x float> %no.md, ptr addrspace(1) %out
 
 ; CHECK: %[[AFN_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[AFN_NO_FDIV0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO0]])
@@ -219,7 +219,7 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
 ; CHECK: %[[AFN_NO_FDIV1:[0-9]+]] =  call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO1]])
 ; CHECK: %[[AFN_NO_MUL1:[0-9]+]] = fmul afn float 2.000000e+00, %[[AFN_NO_FDIV1]]
 ; CHECK: %afn.no.md = insertelement <2 x float> %[[AFN_NO_INS0]], float %[[AFN_NO_MUL1]], i64 1
-; CHECK: store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out
+; CHECK: store volatile <2 x float> %afn.no.md, ptr addrspace(1) %out
 
 ; CHECK: %[[FAST_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]])
@@ -228,7 +228,7 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
 ; CHECK: %[[FAST_NO_RCP1:[0-9]+]] =  call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO1]])
 ; CHECK: %[[FAST_NO_MUL1:[0-9]+]] = fmul fast float 2.000000e+00, %[[FAST_NO_RCP1]]
 ; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_MUL1]], i64 1
-; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+; CHECK: store volatile <2 x float> %fast.no.md, ptr addrspace(1) %out
 
 ; CHECK: %[[AFN_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[AFN_25_RCP0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_250]])
@@ -237,7 +237,7 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
 ; CHECK: %[[AFN_25_RCP1:[0-9]+]] =  call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_251]])
 ; CHECK: %[[AFN_25_MUL1:[0-9]+]] = fmul afn float 2.000000e+00, %[[AFN_25_RCP1]]
 ; CHECK: %afn.25ulp = insertelement <2 x float> %[[AFN_25_INS0]], float %[[AFN_25_MUL1]], i64 1
-; CHECK: store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
+; CHECK: store volatile <2 x float> %afn.25ulp, ptr addrspace(1) %out
 
 ; CHECK: %[[FAST_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]])
@@ -246,22 +246,22 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
 ; CHECK: %[[FAST_25_RCP1:[0-9]+]] =  call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_251]])
 ; CHECK: %[[FAST_25_MUL1:[0-9]+]] = fmul fast float 2.000000e+00, %[[FAST_25_RCP1]]
 ; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_25_INS0]], float %[[FAST_25_MUL1]], i64 1
-; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
-define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+; CHECK: store volatile <2 x float> %fast.25ulp, ptr addrspace(1) %out
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(ptr addrspace(1) %out, <2 x float> %x) #1 {
   %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
-  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %no.md, ptr addrspace(1) %out
 
   %afn.no.md = fdiv afn <2 x float> <float 1.0, float 2.0>, %x
-  store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %afn.no.md, ptr addrspace(1) %out
 
   %fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x
-  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %fast.no.md, ptr addrspace(1) %out
 
   %afn.25ulp = fdiv afn <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
-  store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %afn.25ulp, ptr addrspace(1) %out
 
   %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
-  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %fast.25ulp, ptr addrspace(1) %out
 
   ret void
 }
@@ -290,14 +290,14 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace
 ; CHECK: %[[FAST_MUL1:[0-9]+]] = fmul fast float %[[FAST_A1]], %[[FAST_RCP1]]
 ; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_INS0]], float %[[FAST_MUL1]], i64 1
 ; CHECK: store volatile <2 x float> %fast.25ulp
-define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(ptr addrspace(1) %out, <2 x float> %x, <2 x float> %y) #1 {
   %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
 
   %afn.25ulp = fdiv afn <2 x float> %x.insert, %y, !fpmath !0
-  store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %afn.25ulp, ptr addrspace(1) %out
 
   %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
-  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+  store volatile <2 x float> %fast.25ulp, ptr addrspace(1) %out
 
   ret void
 }
@@ -312,27 +312,27 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> a
 ; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]]
 ; CHECK: %[[RCP_AFN:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %b)
 ; CHECK: %afn.md.25ulp  = fmul afn float %a, %[[RCP_AFN]]
-define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_fpmath_f32_denormals(ptr addrspace(1) %out, float %a, float %b) #2 {
   %no.md = fdiv float %a, %b
-  store volatile float %no.md, float addrspace(1)* %out
+  store volatile float %no.md, ptr addrspace(1) %out
 
   %md.half.ulp = fdiv float %a, %b, !fpmath !1
-  store volatile float %md.half.ulp, float addrspace(1)* %out
+  store volatile float %md.half.ulp, ptr addrspace(1) %out
 
   %md.1ulp = fdiv float %a, %b, !fpmath !2
-  store volatile float %md.1ulp, float addrspace(1)* %out
+  store volatile float %md.1ulp, ptr addrspace(1) %out
 
   %md.25ulp = fdiv float %a, %b, !fpmath !0
-  store volatile float %md.25ulp, float addrspace(1)* %out
+  store volatile float %md.25ulp, ptr addrspace(1) %out
 
   %md.3ulp = fdiv float %a, %b, !fpmath !3
-  store volatile float %md.3ulp, float addrspace(1)* %out
+  store volatile float %md.3ulp, ptr addrspace(1) %out
 
   %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
-  store volatile float %fast.md.25ulp, float addrspace(1)* %out
+  store volatile float %fast.md.25ulp, ptr addrspace(1) %out
 
   %afn.md.25ulp = fdiv afn float %a, %b, !fpmath !0
-  store volatile float %afn.md.25ulp, float addrspace(1)* %out
+  store volatile float %afn.md.25ulp, ptr addrspace(1) %out
 
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index a936bcf4ef31e..5c40a4ce13e31 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -86,7 +86,7 @@ define <2 x i32> @select_sdiv_rhs_const_v2i32(i1 %cond) {
 
 define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
 ; IR-LABEL: @select_sdiv_lhs_opaque_const0_i32(
-; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], i32 ptrtoint (i32 addrspace(1)* @gv to i32), i32 5
+; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 5
 ; IR-NEXT:    [[TMP1:%.*]] = ashr i32 [[SELECT]], 31
 ; IR-NEXT:    [[TMP2:%.*]] = xor i32 0, [[TMP1]]
 ; IR-NEXT:    [[TMP3:%.*]] = add i32 [[SELECT]], [[TMP1]]
@@ -161,14 +161,14 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %select = select i1 %cond, i32 ptrtoint (i32 addrspace(1)* @gv to i32), i32 5
+  %select = select i1 %cond, i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 5
   %op = sdiv i32 1000000, %select
   ret i32 %op
 }
 
 define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
 ; IR-LABEL: @select_sdiv_lhs_opaque_const1_i32(
-; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], i32 5, i32 ptrtoint (i32 addrspace(1)* @gv to i32)
+; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], i32 5, i32 ptrtoint (ptr addrspace(1) @gv to i32)
 ; IR-NEXT:    [[TMP1:%.*]] = ashr i32 [[SELECT]], 31
 ; IR-NEXT:    [[TMP2:%.*]] = xor i32 0, [[TMP1]]
 ; IR-NEXT:    [[TMP3:%.*]] = add i32 [[SELECT]], [[TMP1]]
@@ -243,14 +243,14 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %select = select i1 %cond, i32 5, i32 ptrtoint (i32 addrspace(1)* @gv to i32)
+  %select = select i1 %cond, i32 5, i32 ptrtoint (ptr addrspace(1) @gv to i32)
   %op = sdiv i32 1000000, %select
   ret i32 %op
 }
 
 define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) {
 ; IR-LABEL: @select_sdiv_rhs_opaque_const0_i32(
-; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], i32 ptrtoint (i32 addrspace(1)* @gv to i32), i32 234234
+; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 234234
 ; IR-NEXT:    [[OP:%.*]] = sdiv i32 [[SELECT]], 42
 ; IR-NEXT:    ret i32 [[OP]]
 ;
@@ -273,14 +273,14 @@ define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %select = select i1 %cond, i32 ptrtoint (i32 addrspace(1)* @gv to i32), i32 234234
+  %select = select i1 %cond, i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 234234
   %op = sdiv i32 %select, 42
   ret i32 %op
 }
 
 define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) {
 ; IR-LABEL: @select_sdiv_rhs_opaque_const1_i32(
-; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], i32 42000, i32 ptrtoint (i32 addrspace(1)* @gv to i32)
+; IR-NEXT:    [[SELECT:%.*]] = select i1 [[COND:%.*]], i32 42000, i32 ptrtoint (ptr addrspace(1) @gv to i32)
 ; IR-NEXT:    [[OP:%.*]] = sdiv i32 [[SELECT]], 42
 ; IR-NEXT:    ret i32 [[OP]]
 ;
@@ -303,7 +303,7 @@ define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %select = select i1 %cond, i32 42000, i32 ptrtoint (i32 addrspace(1)* @gv to i32)
+  %select = select i1 %cond, i32 42000, i32 ptrtoint (ptr addrspace(1) @gv to i32)
   %op = sdiv i32 %select, 42
   ret i32 %op
 }
@@ -387,7 +387,7 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) {
 define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
 ; IR-LABEL: @select_add_lhs_const_i16(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i16 128, i16 131
-; IR-NEXT:    store i16 [[OP]], i16 addrspace(1)* undef, align 2
+; IR-NEXT:    store i16 [[OP]], ptr addrspace(1) undef, align 2
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: select_add_lhs_const_i16:
@@ -403,7 +403,7 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
 ; GCN-NEXT:    s_endpgm
   %select = select i1 %cond, i16 5, i16 8
   %op = add i16 %select, 123
-  store i16 %op, i16 addrspace(1)* undef
+  store i16 %op, ptr addrspace(1) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll
index a3b7178d9df11..296370e99bdfa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-foldnegate.ll
@@ -28,13 +28,13 @@ define i1 @fold_negate_intrinsic_test_mask_dbl(double %x) nounwind {
 ; Negative test: should not transform for variable test masks
 ; CHECK: @fold_negate_intrinsic_test_mask_neg_var
 ; CHECK: %[[X0:.*]] = alloca i32
-; CHECK: %[[X1:.*]] = load i32, i32 addrspace(5)* %[[X0]]
+; CHECK: %[[X1:.*]] = load i32, ptr addrspace(5) %[[X0]]
 ; CHECK: call i1 @llvm.amdgcn.class.f32(float %x, i32 %[[X1]])
 ; CHECK: xor
 define i1 @fold_negate_intrinsic_test_mask_neg_var(float %x) nounwind {
   %1 = alloca i32, addrspace(5)
-  store i32 7, i32 addrspace(5)* %1
-  %2 = load i32, i32 addrspace(5)* %1
+  store i32 7, ptr addrspace(5) %1
+  %2 = load i32, ptr addrspace(5) %1
   %3 = call i1 @llvm.amdgcn.class.f32(float %x, i32 %2)
   %4 = xor i1 %3, -1
   ret i1 %4
@@ -50,7 +50,7 @@ define i1 @fold_negate_intrinsic_test_mask_neg_multiple_uses(float %x) nounwind
   %y = alloca i1, addrspace(5)
   %1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 7)
   %2 = xor i1 %1, -1
-  store i1 %1, i1 addrspace(5)* %y
+  store i1 %1, ptr addrspace(5) %y
   %3 = xor i1 %1, -1
   ret i1 %2
 }

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
index 01c2da8f399c5..210356d131350 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
@@ -5,7 +5,7 @@
 define amdgpu_kernel void @add_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @add_i3(
 ; SI-NEXT:    [[R:%.*]] = add i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_i3(
@@ -13,18 +13,18 @@ define amdgpu_kernel void @add_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = add i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @add_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = add nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nsw_i3(
@@ -32,18 +32,18 @@ define amdgpu_kernel void @add_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = add nsw i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_nuw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @add_nuw_i3(
 ; SI-NEXT:    [[R:%.*]] = add nuw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_i3(
@@ -51,18 +51,18 @@ define amdgpu_kernel void @add_nuw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = add nuw i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_nuw_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @add_nuw_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = add nuw nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_nsw_i3(
@@ -70,18 +70,18 @@ define amdgpu_kernel void @add_nuw_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = add nuw nsw i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @sub_i3(
 ; SI-NEXT:    [[R:%.*]] = sub i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_i3(
@@ -89,18 +89,18 @@ define amdgpu_kernel void @sub_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = sub i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @sub_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = sub nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nsw_i3(
@@ -108,18 +108,18 @@ define amdgpu_kernel void @sub_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = sub nsw i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_nuw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @sub_nuw_i3(
 ; SI-NEXT:    [[R:%.*]] = sub nuw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_i3(
@@ -127,18 +127,18 @@ define amdgpu_kernel void @sub_nuw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_nuw_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @sub_nuw_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = sub nuw nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_nsw_i3(
@@ -146,18 +146,18 @@ define amdgpu_kernel void @sub_nuw_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw nsw i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @mul_i3(
 ; SI-NEXT:    [[R:%.*]] = mul i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_i3(
@@ -165,18 +165,18 @@ define amdgpu_kernel void @mul_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = mul i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @mul_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = mul nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nsw_i3(
@@ -184,18 +184,18 @@ define amdgpu_kernel void @mul_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = mul nsw i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_nuw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @mul_nuw_i3(
 ; SI-NEXT:    [[R:%.*]] = mul nuw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_i3(
@@ -203,18 +203,18 @@ define amdgpu_kernel void @mul_nuw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_nuw_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @mul_nuw_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = mul nuw nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_nsw_i3(
@@ -222,18 +222,18 @@ define amdgpu_kernel void @mul_nuw_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw nsw i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @shl_i3(
 ; SI-NEXT:    [[R:%.*]] = shl i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_i3(
@@ -241,18 +241,18 @@ define amdgpu_kernel void @shl_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = shl i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @shl_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = shl nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nsw_i3(
@@ -260,18 +260,18 @@ define amdgpu_kernel void @shl_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = shl nsw i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_nuw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @shl_nuw_i3(
 ; SI-NEXT:    [[R:%.*]] = shl nuw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_i3(
@@ -279,18 +279,18 @@ define amdgpu_kernel void @shl_nuw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_nuw_nsw_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @shl_nuw_nsw_i3(
 ; SI-NEXT:    [[R:%.*]] = shl nuw nsw i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_nsw_i3(
@@ -298,18 +298,18 @@ define amdgpu_kernel void @shl_nuw_nsw_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw nsw i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @lshr_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @lshr_i3(
 ; SI-NEXT:    [[R:%.*]] = lshr i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_i3(
@@ -317,18 +317,18 @@ define amdgpu_kernel void @lshr_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = lshr i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @lshr_exact_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @lshr_exact_i3(
 ; SI-NEXT:    [[R:%.*]] = lshr exact i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_exact_i3(
@@ -336,18 +336,18 @@ define amdgpu_kernel void @lshr_exact_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = lshr exact i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = lshr exact i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @ashr_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @ashr_i3(
 ; SI-NEXT:    [[R:%.*]] = ashr i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_i3(
@@ -355,18 +355,18 @@ define amdgpu_kernel void @ashr_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = ashr i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @ashr_exact_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @ashr_exact_i3(
 ; SI-NEXT:    [[R:%.*]] = ashr exact i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_exact_i3(
@@ -374,18 +374,18 @@ define amdgpu_kernel void @ashr_exact_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = ashr exact i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = ashr exact i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @and_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @and_i3(
 ; SI-NEXT:    [[R:%.*]] = and i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @and_i3(
@@ -393,18 +393,18 @@ define amdgpu_kernel void @and_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = and i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = and i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @or_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @or_i3(
 ; SI-NEXT:    [[R:%.*]] = or i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @or_i3(
@@ -412,18 +412,18 @@ define amdgpu_kernel void @or_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = or i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = or i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @xor_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @xor_i3(
 ; SI-NEXT:    [[R:%.*]] = xor i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i3 [[R]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[R]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @xor_i3(
@@ -431,11 +431,11 @@ define amdgpu_kernel void @xor_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %r = xor i3 %a, %b
-  store volatile i3 %r, i3 addrspace(1)* undef
+  store volatile i3 %r, ptr addrspace(1) undef
   ret void
 }
 
@@ -443,7 +443,7 @@ define amdgpu_kernel void @select_eq_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_eq_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp eq i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[SEL]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_eq_i3(
@@ -454,12 +454,12 @@ define amdgpu_kernel void @select_eq_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP7]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp eq i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  store volatile i3 %sel, i3 addrspace(1)* undef
+  store volatile i3 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -467,7 +467,7 @@ define amdgpu_kernel void @select_ne_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_ne_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ne i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[SEL]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ne_i3(
@@ -478,12 +478,12 @@ define amdgpu_kernel void @select_ne_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP7]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ne i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  store volatile i3 %sel, i3 addrspace(1)* undef
+  store volatile i3 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -491,7 +491,7 @@ define amdgpu_kernel void @select_ugt_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_ugt_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ugt i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[SEL]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ugt_i3(
@@ -502,12 +502,12 @@ define amdgpu_kernel void @select_ugt_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP7]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ugt i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  store volatile i3 %sel, i3 addrspace(1)* undef
+  store volatile i3 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -515,7 +515,7 @@ define amdgpu_kernel void @select_uge_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_uge_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp uge i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[SEL]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_uge_i3(
@@ -526,12 +526,12 @@ define amdgpu_kernel void @select_uge_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP7]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp uge i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  store volatile i3 %sel, i3 addrspace(1)* undef
+  store volatile i3 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -539,7 +539,7 @@ define amdgpu_kernel void @select_ult_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_ult_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ult i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[SEL]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ult_i3(
@@ -550,12 +550,12 @@ define amdgpu_kernel void @select_ult_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP7]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ult i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  store volatile i3 %sel, i3 addrspace(1)* undef
+  store volatile i3 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -563,7 +563,7 @@ define amdgpu_kernel void @select_ule_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_ule_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ule i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[SEL]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ule_i3(
@@ -574,12 +574,12 @@ define amdgpu_kernel void @select_ule_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP7]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ule i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  store volatile i3 %sel, i3 addrspace(1)* undef
+  store volatile i3 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -587,7 +587,7 @@ define amdgpu_kernel void @select_sgt_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_sgt_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sgt i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[SEL]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sgt_i3(
@@ -598,12 +598,12 @@ define amdgpu_kernel void @select_sgt_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP7]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sgt i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  store volatile i3 %sel, i3 addrspace(1)* undef
+  store volatile i3 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -611,7 +611,7 @@ define amdgpu_kernel void @select_sge_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_sge_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sge i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[SEL]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sge_i3(
@@ -622,12 +622,12 @@ define amdgpu_kernel void @select_sge_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP7]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sge i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  store volatile i3 %sel, i3 addrspace(1)* undef
+  store volatile i3 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -635,7 +635,7 @@ define amdgpu_kernel void @select_slt_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_slt_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp slt i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[SEL]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_slt_i3(
@@ -646,12 +646,12 @@ define amdgpu_kernel void @select_slt_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP7]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp slt i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  store volatile i3 %sel, i3 addrspace(1)* undef
+  store volatile i3 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -659,7 +659,7 @@ define amdgpu_kernel void @select_sle_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @select_sle_i3(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sle i3 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[SEL]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sle_i3(
@@ -670,12 +670,12 @@ define amdgpu_kernel void @select_sle_i3(i3 %a, i3 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i3 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP7]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sle i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  store volatile i3 %sel, i3 addrspace(1)* undef
+  store volatile i3 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -683,7 +683,7 @@ declare i3 @llvm.bitreverse.i3(i3)
 define amdgpu_kernel void @bitreverse_i3(i3 %a) {
 ; SI-LABEL: @bitreverse_i3(
 ; SI-NEXT:    [[BREV:%.*]] = call i3 @llvm.bitreverse.i3(i3 [[A:%.*]])
-; SI-NEXT:    store volatile i3 [[BREV]], i3 addrspace(1)* undef, align 1
+; SI-NEXT:    store volatile i3 [[BREV]], ptr addrspace(1) undef, align 1
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @bitreverse_i3(
@@ -691,18 +691,18 @@ define amdgpu_kernel void @bitreverse_i3(i3 %a) {
 ; VI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[TMP1]])
 ; VI-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP2]], 29
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], i3 addrspace(1)* undef, align 1
+; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) undef, align 1
 ; VI-NEXT:    ret void
 ;
   %brev = call i3 @llvm.bitreverse.i3(i3 %a)
-  store volatile i3 %brev, i3 addrspace(1)* undef
+  store volatile i3 %brev, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @add_i16(
 ; SI-NEXT:    [[R:%.*]] = add i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_i16(
@@ -710,63 +710,63 @@ define amdgpu_kernel void @add_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @constant_add_i16() {
 ; SI-LABEL: @constant_add_i16(
 ; SI-NEXT:    [[R:%.*]] = add i16 1, 2
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @constant_add_i16(
-; VI-NEXT:    store volatile i16 3, i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 3, ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add i16 1, 2
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @constant_add_nsw_i16() {
 ; SI-LABEL: @constant_add_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = add nsw i16 1, 2
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @constant_add_nsw_i16(
-; VI-NEXT:    store volatile i16 3, i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 3, ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add nsw i16 1, 2
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @constant_add_nuw_i16() {
 ; SI-LABEL: @constant_add_nuw_i16(
 ; SI-NEXT:    [[R:%.*]] = add nsw i16 1, 2
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @constant_add_nuw_i16(
-; VI-NEXT:    store volatile i16 3, i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 3, ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add nsw i16 1, 2
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @add_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = add nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nsw_i16(
@@ -774,18 +774,18 @@ define amdgpu_kernel void @add_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add nsw i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_nuw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @add_nuw_i16(
 ; SI-NEXT:    [[R:%.*]] = add nuw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_i16(
@@ -793,18 +793,18 @@ define amdgpu_kernel void @add_nuw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add nuw i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_nuw_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @add_nuw_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = add nuw nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_nsw_i16(
@@ -812,18 +812,18 @@ define amdgpu_kernel void @add_nuw_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = add nuw nsw i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @sub_i16(
 ; SI-NEXT:    [[R:%.*]] = sub i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_i16(
@@ -831,18 +831,18 @@ define amdgpu_kernel void @sub_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = sub i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @sub_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = sub nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nsw_i16(
@@ -850,18 +850,18 @@ define amdgpu_kernel void @sub_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = sub nsw i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_nuw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @sub_nuw_i16(
 ; SI-NEXT:    [[R:%.*]] = sub nuw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_i16(
@@ -869,18 +869,18 @@ define amdgpu_kernel void @sub_nuw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_nuw_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @sub_nuw_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = sub nuw nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_nsw_i16(
@@ -888,18 +888,18 @@ define amdgpu_kernel void @sub_nuw_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw nsw i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @mul_i16(
 ; SI-NEXT:    [[R:%.*]] = mul i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_i16(
@@ -907,18 +907,18 @@ define amdgpu_kernel void @mul_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = mul i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @mul_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = mul nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nsw_i16(
@@ -926,18 +926,18 @@ define amdgpu_kernel void @mul_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = mul nsw i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_nuw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @mul_nuw_i16(
 ; SI-NEXT:    [[R:%.*]] = mul nuw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_i16(
@@ -945,18 +945,18 @@ define amdgpu_kernel void @mul_nuw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_nuw_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @mul_nuw_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = mul nuw nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_nsw_i16(
@@ -964,18 +964,18 @@ define amdgpu_kernel void @mul_nuw_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw nsw i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @shl_i16(
 ; SI-NEXT:    [[R:%.*]] = shl i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_i16(
@@ -983,18 +983,18 @@ define amdgpu_kernel void @shl_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = shl i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @shl_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = shl nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nsw_i16(
@@ -1002,18 +1002,18 @@ define amdgpu_kernel void @shl_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = shl nsw i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_nuw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @shl_nuw_i16(
 ; SI-NEXT:    [[R:%.*]] = shl nuw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_i16(
@@ -1021,18 +1021,18 @@ define amdgpu_kernel void @shl_nuw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_nuw_nsw_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @shl_nuw_nsw_i16(
 ; SI-NEXT:    [[R:%.*]] = shl nuw nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_nsw_i16(
@@ -1040,18 +1040,18 @@ define amdgpu_kernel void @shl_nuw_nsw_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw nsw i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @lshr_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @lshr_i16(
 ; SI-NEXT:    [[R:%.*]] = lshr i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_i16(
@@ -1059,18 +1059,18 @@ define amdgpu_kernel void @lshr_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = lshr i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @lshr_exact_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @lshr_exact_i16(
 ; SI-NEXT:    [[R:%.*]] = lshr exact i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_exact_i16(
@@ -1078,18 +1078,18 @@ define amdgpu_kernel void @lshr_exact_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = lshr exact i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = lshr exact i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @ashr_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @ashr_i16(
 ; SI-NEXT:    [[R:%.*]] = ashr i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_i16(
@@ -1097,18 +1097,18 @@ define amdgpu_kernel void @ashr_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = ashr i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @ashr_exact_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @ashr_exact_i16(
 ; SI-NEXT:    [[R:%.*]] = ashr exact i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_exact_i16(
@@ -1116,33 +1116,33 @@ define amdgpu_kernel void @ashr_exact_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = ashr exact i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = ashr exact i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @constant_lshr_exact_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @constant_lshr_exact_i16(
 ; SI-NEXT:    [[R:%.*]] = lshr exact i16 4, 1
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @constant_lshr_exact_i16(
-; VI-NEXT:    store volatile i16 2, i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 2, ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = lshr exact i16 4, 1
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @and_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @and_i16(
 ; SI-NEXT:    [[R:%.*]] = and i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @and_i16(
@@ -1150,18 +1150,18 @@ define amdgpu_kernel void @and_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = and i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = and i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @or_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @or_i16(
 ; SI-NEXT:    [[R:%.*]] = or i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @or_i16(
@@ -1169,18 +1169,18 @@ define amdgpu_kernel void @or_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = or i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = or i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @xor_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @xor_i16(
 ; SI-NEXT:    [[R:%.*]] = xor i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @xor_i16(
@@ -1188,11 +1188,11 @@ define amdgpu_kernel void @xor_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %r = xor i16 %a, %b
-  store volatile i16 %r, i16 addrspace(1)* undef
+  store volatile i16 %r, ptr addrspace(1) undef
   ret void
 }
 
@@ -1200,7 +1200,7 @@ define amdgpu_kernel void @select_eq_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_eq_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp eq i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_eq_i16(
@@ -1211,12 +1211,12 @@ define amdgpu_kernel void @select_eq_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp eq i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, i16 addrspace(1)* undef
+  store volatile i16 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1224,7 +1224,7 @@ define amdgpu_kernel void @select_ne_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_ne_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ne i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ne_i16(
@@ -1235,12 +1235,12 @@ define amdgpu_kernel void @select_ne_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ne i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, i16 addrspace(1)* undef
+  store volatile i16 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1248,7 +1248,7 @@ define amdgpu_kernel void @select_ugt_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_ugt_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ugt i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ugt_i16(
@@ -1259,12 +1259,12 @@ define amdgpu_kernel void @select_ugt_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ugt i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, i16 addrspace(1)* undef
+  store volatile i16 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1272,7 +1272,7 @@ define amdgpu_kernel void @select_uge_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_uge_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp uge i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_uge_i16(
@@ -1283,12 +1283,12 @@ define amdgpu_kernel void @select_uge_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp uge i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, i16 addrspace(1)* undef
+  store volatile i16 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1296,7 +1296,7 @@ define amdgpu_kernel void @select_ult_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_ult_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ult i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ult_i16(
@@ -1307,12 +1307,12 @@ define amdgpu_kernel void @select_ult_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ult i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, i16 addrspace(1)* undef
+  store volatile i16 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1320,7 +1320,7 @@ define amdgpu_kernel void @select_ule_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_ule_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ule i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ule_i16(
@@ -1331,12 +1331,12 @@ define amdgpu_kernel void @select_ule_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ule i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, i16 addrspace(1)* undef
+  store volatile i16 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1344,7 +1344,7 @@ define amdgpu_kernel void @select_sgt_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_sgt_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sgt i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sgt_i16(
@@ -1355,12 +1355,12 @@ define amdgpu_kernel void @select_sgt_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sgt i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, i16 addrspace(1)* undef
+  store volatile i16 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1368,7 +1368,7 @@ define amdgpu_kernel void @select_sge_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_sge_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sge i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sge_i16(
@@ -1379,12 +1379,12 @@ define amdgpu_kernel void @select_sge_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sge i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, i16 addrspace(1)* undef
+  store volatile i16 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1392,7 +1392,7 @@ define amdgpu_kernel void @select_slt_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_slt_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_slt_i16(
@@ -1403,12 +1403,12 @@ define amdgpu_kernel void @select_slt_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp slt i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, i16 addrspace(1)* undef
+  store volatile i16 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1416,7 +1416,7 @@ define amdgpu_kernel void @select_sle_i16(i16 %a, i16 %b) {
 ; SI-LABEL: @select_sle_i16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sle i16 [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sle_i16(
@@ -1427,12 +1427,12 @@ define amdgpu_kernel void @select_sle_i16(i16 %a, i16 %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext i16 [[B]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sle i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, i16 addrspace(1)* undef
+  store volatile i16 %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1441,7 +1441,7 @@ declare i16 @llvm.bitreverse.i16(i16)
 define amdgpu_kernel void @bitreverse_i16(i16 %a) {
 ; SI-LABEL: @bitreverse_i16(
 ; SI-NEXT:    [[BREV:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[A:%.*]])
-; SI-NEXT:    store volatile i16 [[BREV]], i16 addrspace(1)* undef, align 2
+; SI-NEXT:    store volatile i16 [[BREV]], ptr addrspace(1) undef, align 2
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @bitreverse_i16(
@@ -1449,18 +1449,18 @@ define amdgpu_kernel void @bitreverse_i16(i16 %a) {
 ; VI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[TMP1]])
 ; VI-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP2]], 16
 ; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], i16 addrspace(1)* undef, align 2
+; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) undef, align 2
 ; VI-NEXT:    ret void
 ;
   %brev = call i16 @llvm.bitreverse.i16(i16 %a)
-  store volatile i16 %brev, i16 addrspace(1)* undef
+  store volatile i16 %brev, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @add_3xi15(
 ; SI-NEXT:    [[R:%.*]] = add <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_3xi15(
@@ -1468,18 +1468,18 @@ define amdgpu_kernel void @add_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @add_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = add nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nsw_3xi15(
@@ -1487,18 +1487,18 @@ define amdgpu_kernel void @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @add_nuw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = add nuw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_3xi15(
@@ -1506,18 +1506,18 @@ define amdgpu_kernel void @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add nuw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @add_nuw_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = add nuw nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_nsw_3xi15(
@@ -1525,18 +1525,18 @@ define amdgpu_kernel void @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add nuw nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @sub_3xi15(
 ; SI-NEXT:    [[R:%.*]] = sub <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_3xi15(
@@ -1544,18 +1544,18 @@ define amdgpu_kernel void @sub_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @sub_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = sub nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nsw_3xi15(
@@ -1563,18 +1563,18 @@ define amdgpu_kernel void @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @sub_nuw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = sub nuw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_3xi15(
@@ -1582,18 +1582,18 @@ define amdgpu_kernel void @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @sub_nuw_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = sub nuw nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_nsw_3xi15(
@@ -1601,18 +1601,18 @@ define amdgpu_kernel void @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @mul_3xi15(
 ; SI-NEXT:    [[R:%.*]] = mul <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_3xi15(
@@ -1620,18 +1620,18 @@ define amdgpu_kernel void @mul_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @mul_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = mul nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nsw_3xi15(
@@ -1639,18 +1639,18 @@ define amdgpu_kernel void @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @mul_nuw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = mul nuw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_3xi15(
@@ -1658,18 +1658,18 @@ define amdgpu_kernel void @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @mul_nuw_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = mul nuw nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_nsw_3xi15(
@@ -1677,18 +1677,18 @@ define amdgpu_kernel void @mul_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @shl_3xi15(
 ; SI-NEXT:    [[R:%.*]] = shl <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_3xi15(
@@ -1696,18 +1696,18 @@ define amdgpu_kernel void @shl_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @shl_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = shl nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nsw_3xi15(
@@ -1715,18 +1715,18 @@ define amdgpu_kernel void @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @shl_nuw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = shl nuw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_3xi15(
@@ -1734,18 +1734,18 @@ define amdgpu_kernel void @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @shl_nuw_nsw_3xi15(
 ; SI-NEXT:    [[R:%.*]] = shl nuw nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_nsw_3xi15(
@@ -1753,18 +1753,18 @@ define amdgpu_kernel void @shl_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @lshr_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @lshr_3xi15(
 ; SI-NEXT:    [[R:%.*]] = lshr <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_3xi15(
@@ -1772,18 +1772,18 @@ define amdgpu_kernel void @lshr_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = lshr <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = lshr <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @lshr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @lshr_exact_3xi15(
 ; SI-NEXT:    [[R:%.*]] = lshr exact <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_exact_3xi15(
@@ -1791,18 +1791,18 @@ define amdgpu_kernel void @lshr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = lshr exact <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = lshr exact <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @ashr_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @ashr_3xi15(
 ; SI-NEXT:    [[R:%.*]] = ashr <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_3xi15(
@@ -1810,18 +1810,18 @@ define amdgpu_kernel void @ashr_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = ashr <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = ashr <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @ashr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @ashr_exact_3xi15(
 ; SI-NEXT:    [[R:%.*]] = ashr exact <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_exact_3xi15(
@@ -1829,18 +1829,18 @@ define amdgpu_kernel void @ashr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = ashr exact <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = ashr exact <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @and_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @and_3xi15(
 ; SI-NEXT:    [[R:%.*]] = and <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @and_3xi15(
@@ -1848,18 +1848,18 @@ define amdgpu_kernel void @and_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = and <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = and <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @or_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @or_3xi15(
 ; SI-NEXT:    [[R:%.*]] = or <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @or_3xi15(
@@ -1867,18 +1867,18 @@ define amdgpu_kernel void @or_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = or <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = or <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @xor_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @xor_3xi15(
 ; SI-NEXT:    [[R:%.*]] = xor <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @xor_3xi15(
@@ -1886,11 +1886,11 @@ define amdgpu_kernel void @xor_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = xor <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = xor <3 x i15> %a, %b
-  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %r, ptr addrspace(1) undef
   ret void
 }
 
@@ -1898,7 +1898,7 @@ define amdgpu_kernel void @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_eq_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp eq <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_eq_3xi15(
@@ -1909,12 +1909,12 @@ define amdgpu_kernel void @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp eq <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1922,7 +1922,7 @@ define amdgpu_kernel void @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_ne_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ne <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ne_3xi15(
@@ -1933,12 +1933,12 @@ define amdgpu_kernel void @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ne <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1946,7 +1946,7 @@ define amdgpu_kernel void @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_ugt_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ugt <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ugt_3xi15(
@@ -1957,12 +1957,12 @@ define amdgpu_kernel void @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ugt <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1970,7 +1970,7 @@ define amdgpu_kernel void @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_uge_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp uge <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_uge_3xi15(
@@ -1981,12 +1981,12 @@ define amdgpu_kernel void @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp uge <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -1994,7 +1994,7 @@ define amdgpu_kernel void @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_ult_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ult <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ult_3xi15(
@@ -2005,12 +2005,12 @@ define amdgpu_kernel void @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ult <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2018,7 +2018,7 @@ define amdgpu_kernel void @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_ule_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ule <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ule_3xi15(
@@ -2029,12 +2029,12 @@ define amdgpu_kernel void @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ule <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2042,7 +2042,7 @@ define amdgpu_kernel void @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_sgt_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sgt <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sgt_3xi15(
@@ -2053,12 +2053,12 @@ define amdgpu_kernel void @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sgt <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2066,7 +2066,7 @@ define amdgpu_kernel void @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_sge_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sge <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sge_3xi15(
@@ -2077,12 +2077,12 @@ define amdgpu_kernel void @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sge <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2090,7 +2090,7 @@ define amdgpu_kernel void @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_slt_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp slt <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_slt_3xi15(
@@ -2101,12 +2101,12 @@ define amdgpu_kernel void @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp slt <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2114,7 +2114,7 @@ define amdgpu_kernel void @select_sle_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-LABEL: @select_sle_3xi15(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sle <3 x i15> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sle_3xi15(
@@ -2125,12 +2125,12 @@ define amdgpu_kernel void @select_sle_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i15> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sle <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2138,7 +2138,7 @@ declare <3 x i15> @llvm.bitreverse.v3i15(<3 x i15>)
 define amdgpu_kernel void @bitreverse_3xi15(<3 x i15> %a) {
 ; SI-LABEL: @bitreverse_3xi15(
 ; SI-NEXT:    [[BREV:%.*]] = call <3 x i15> @llvm.bitreverse.v3i15(<3 x i15> [[A:%.*]])
-; SI-NEXT:    store volatile <3 x i15> [[BREV]], <3 x i15> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i15> [[BREV]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @bitreverse_3xi15(
@@ -2146,18 +2146,18 @@ define amdgpu_kernel void @bitreverse_3xi15(<3 x i15> %a) {
 ; VI-NEXT:    [[TMP2:%.*]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> [[TMP1]])
 ; VI-NEXT:    [[TMP3:%.*]] = lshr <3 x i32> [[TMP2]], <i32 17, i32 17, i32 17>
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], <3 x i15> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %brev = call <3 x i15> @llvm.bitreverse.v3i15(<3 x i15> %a)
-  store volatile <3 x i15> %brev, <3 x i15> addrspace(1)* undef
+  store volatile <3 x i15> %brev, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @add_3xi16(
 ; SI-NEXT:    [[R:%.*]] = add <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_3xi16(
@@ -2165,18 +2165,18 @@ define amdgpu_kernel void @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @add_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = add nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nsw_3xi16(
@@ -2184,18 +2184,18 @@ define amdgpu_kernel void @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @add_nuw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = add nuw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_3xi16(
@@ -2203,18 +2203,18 @@ define amdgpu_kernel void @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add nuw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @add_nuw_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = add nuw nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @add_nuw_nsw_3xi16(
@@ -2222,18 +2222,18 @@ define amdgpu_kernel void @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = add nuw nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @sub_3xi16(
 ; SI-NEXT:    [[R:%.*]] = sub <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_3xi16(
@@ -2241,18 +2241,18 @@ define amdgpu_kernel void @sub_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @sub_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = sub nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nsw_3xi16(
@@ -2260,18 +2260,18 @@ define amdgpu_kernel void @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @sub_nuw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = sub nuw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_3xi16(
@@ -2279,18 +2279,18 @@ define amdgpu_kernel void @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @sub_nuw_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = sub nuw nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @sub_nuw_nsw_3xi16(
@@ -2298,18 +2298,18 @@ define amdgpu_kernel void @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = sub nuw nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @mul_3xi16(
 ; SI-NEXT:    [[R:%.*]] = mul <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_3xi16(
@@ -2317,18 +2317,18 @@ define amdgpu_kernel void @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @mul_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = mul nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nsw_3xi16(
@@ -2336,18 +2336,18 @@ define amdgpu_kernel void @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @mul_nuw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = mul nuw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_3xi16(
@@ -2355,18 +2355,18 @@ define amdgpu_kernel void @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @mul_nuw_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = mul nuw nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @mul_nuw_nsw_3xi16(
@@ -2374,18 +2374,18 @@ define amdgpu_kernel void @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = mul nuw nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @shl_3xi16(
 ; SI-NEXT:    [[R:%.*]] = shl <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_3xi16(
@@ -2393,18 +2393,18 @@ define amdgpu_kernel void @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @shl_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = shl nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nsw_3xi16(
@@ -2412,18 +2412,18 @@ define amdgpu_kernel void @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @shl_nuw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = shl nuw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_3xi16(
@@ -2431,18 +2431,18 @@ define amdgpu_kernel void @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @shl_nuw_nsw_3xi16(
 ; SI-NEXT:    [[R:%.*]] = shl nuw nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @shl_nuw_nsw_3xi16(
@@ -2450,18 +2450,18 @@ define amdgpu_kernel void @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = shl nuw nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @lshr_3xi16(
 ; SI-NEXT:    [[R:%.*]] = lshr <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_3xi16(
@@ -2469,18 +2469,18 @@ define amdgpu_kernel void @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = lshr <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = lshr <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @lshr_exact_3xi16(
 ; SI-NEXT:    [[R:%.*]] = lshr exact <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @lshr_exact_3xi16(
@@ -2488,18 +2488,18 @@ define amdgpu_kernel void @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = lshr exact <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = lshr exact <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @ashr_3xi16(
 ; SI-NEXT:    [[R:%.*]] = ashr <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_3xi16(
@@ -2507,18 +2507,18 @@ define amdgpu_kernel void @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = ashr <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = ashr <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @ashr_exact_3xi16(
 ; SI-NEXT:    [[R:%.*]] = ashr exact <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @ashr_exact_3xi16(
@@ -2526,18 +2526,18 @@ define amdgpu_kernel void @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = ashr exact <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = ashr exact <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @and_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @and_3xi16(
 ; SI-NEXT:    [[R:%.*]] = and <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @and_3xi16(
@@ -2545,18 +2545,18 @@ define amdgpu_kernel void @and_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = and <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = and <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @or_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @or_3xi16(
 ; SI-NEXT:    [[R:%.*]] = or <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @or_3xi16(
@@ -2564,18 +2564,18 @@ define amdgpu_kernel void @or_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = or <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = or <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_kernel void @xor_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @xor_3xi16(
 ; SI-NEXT:    [[R:%.*]] = xor <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @xor_3xi16(
@@ -2583,11 +2583,11 @@ define amdgpu_kernel void @xor_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
 ; VI-NEXT:    [[TMP3:%.*]] = xor <3 x i32> [[TMP1]], [[TMP2]]
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %r = xor <3 x i16> %a, %b
-  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %r, ptr addrspace(1) undef
   ret void
 }
 
@@ -2595,7 +2595,7 @@ define amdgpu_kernel void @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_eq_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp eq <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_eq_3xi16(
@@ -2606,12 +2606,12 @@ define amdgpu_kernel void @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp eq <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2619,7 +2619,7 @@ define amdgpu_kernel void @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_ne_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ne <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ne_3xi16(
@@ -2630,12 +2630,12 @@ define amdgpu_kernel void @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ne <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2643,7 +2643,7 @@ define amdgpu_kernel void @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_ugt_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ugt <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ugt_3xi16(
@@ -2654,12 +2654,12 @@ define amdgpu_kernel void @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ugt <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2667,7 +2667,7 @@ define amdgpu_kernel void @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_uge_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp uge <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_uge_3xi16(
@@ -2678,12 +2678,12 @@ define amdgpu_kernel void @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp uge <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2691,7 +2691,7 @@ define amdgpu_kernel void @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_ult_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ult <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ult_3xi16(
@@ -2702,12 +2702,12 @@ define amdgpu_kernel void @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ult <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2715,7 +2715,7 @@ define amdgpu_kernel void @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_ule_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp ule <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_ule_3xi16(
@@ -2726,12 +2726,12 @@ define amdgpu_kernel void @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp ule <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2739,7 +2739,7 @@ define amdgpu_kernel void @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_sgt_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sgt <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sgt_3xi16(
@@ -2750,12 +2750,12 @@ define amdgpu_kernel void @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sgt <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2763,7 +2763,7 @@ define amdgpu_kernel void @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_sge_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sge <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sge_3xi16(
@@ -2774,12 +2774,12 @@ define amdgpu_kernel void @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sge <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2787,7 +2787,7 @@ define amdgpu_kernel void @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_slt_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp slt <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_slt_3xi16(
@@ -2798,12 +2798,12 @@ define amdgpu_kernel void @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp slt <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2811,7 +2811,7 @@ define amdgpu_kernel void @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-LABEL: @select_sle_3xi16(
 ; SI-NEXT:    [[CMP:%.*]] = icmp sle <3 x i16> [[A:%.*]], [[B:%.*]]
 ; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]]
-; SI-NEXT:    store volatile <3 x i16> [[SEL]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[SEL]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @select_sle_3xi16(
@@ -2822,12 +2822,12 @@ define amdgpu_kernel void @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i16> [[B]] to <3 x i32>
 ; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
 ; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP7]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP7]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %cmp = icmp sle <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %sel, ptr addrspace(1) undef
   ret void
 }
 
@@ -2836,7 +2836,7 @@ declare <3 x i16> @llvm.bitreverse.v3i16(<3 x i16>)
 define amdgpu_kernel void @bitreverse_3xi16(<3 x i16> %a) {
 ; SI-LABEL: @bitreverse_3xi16(
 ; SI-NEXT:    [[BREV:%.*]] = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> [[A:%.*]])
-; SI-NEXT:    store volatile <3 x i16> [[BREV]], <3 x i16> addrspace(1)* undef, align 8
+; SI-NEXT:    store volatile <3 x i16> [[BREV]], ptr addrspace(1) undef, align 8
 ; SI-NEXT:    ret void
 ;
 ; VI-LABEL: @bitreverse_3xi16(
@@ -2844,10 +2844,10 @@ define amdgpu_kernel void @bitreverse_3xi16(<3 x i16> %a) {
 ; VI-NEXT:    [[TMP2:%.*]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> [[TMP1]])
 ; VI-NEXT:    [[TMP3:%.*]] = lshr <3 x i32> [[TMP2]], <i32 16, i32 16, i32 16>
 ; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], <3 x i16> addrspace(1)* undef, align 8
+; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) undef, align 8
 ; VI-NEXT:    ret void
 ;
   %brev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a)
-  store volatile <3 x i16> %brev, <3 x i16> addrspace(1)* undef
+  store volatile <3 x i16> %brev, ptr addrspace(1) undef
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index a76b69fab36f9..92f5d0a2d4057 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
 
-define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-LABEL: @udiv_i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
 ; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
@@ -34,7 +34,7 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
 ; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
-; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP29]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i32:
@@ -99,11 +99,11 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i32 %x, %y
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-LABEL: @urem_i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
 ; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
@@ -132,7 +132,7 @@ define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
-; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP27]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i32:
@@ -192,11 +192,11 @@ define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i32 %x, %y
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-LABEL: @sdiv_i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
@@ -236,7 +236,7 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
-; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP38]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i32:
@@ -319,11 +319,11 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i32 %x, %y
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-LABEL: @srem_i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
 ; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
@@ -360,7 +360,7 @@ define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
 ; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
-; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP35]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i32:
@@ -434,11 +434,11 @@ define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i32 %x, %y
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
+define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; CHECK-LABEL: @udiv_i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
@@ -457,7 +457,7 @@ define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
 ; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
-; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; CHECK-NEXT:    store i16 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i16:
@@ -501,11 +501,11 @@ define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    global_store_short v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i16 %x, %y
-  store i16 %r, i16 addrspace(1)* %out
+  store i16 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
+define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; CHECK-LABEL: @urem_i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
@@ -526,7 +526,7 @@ define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
 ; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
-; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; CHECK-NEXT:    store i16 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i16:
@@ -575,11 +575,11 @@ define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i16 %x, %y
-  store i16 %r, i16 addrspace(1)* %out
+  store i16 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
+define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; CHECK-LABEL: @sdiv_i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
@@ -602,7 +602,7 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
 ; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
 ; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
-; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; CHECK-NEXT:    store i16 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i16:
@@ -656,11 +656,11 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i16 %x, %y
-  store i16 %r, i16 addrspace(1)* %out
+  store i16 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
+define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; CHECK-LABEL: @srem_i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
@@ -685,7 +685,7 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
 ; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
 ; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
-; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; CHECK-NEXT:    store i16 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i16:
@@ -744,11 +744,11 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i16 %x, %y
-  store i16 %r, i16 addrspace(1)* %out
+  store i16 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
+define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; CHECK-LABEL: @udiv_i8(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
@@ -767,7 +767,7 @@ define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
 ; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
-; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
+; CHECK-NEXT:    store i8 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i8:
@@ -807,11 +807,11 @@ define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i8 %x, %y
-  store i8 %r, i8 addrspace(1)* %out
+  store i8 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
+define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; CHECK-LABEL: @urem_i8(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
@@ -832,7 +832,7 @@ define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
 ; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
-; CHECK-NEXT:    store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1
+; CHECK-NEXT:    store i8 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i8:
@@ -879,11 +879,11 @@ define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i8 %x, %y
-  store i8 %r, i8 addrspace(1)* %out
+  store i8 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
+define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; CHECK-LABEL: @sdiv_i8(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
@@ -906,7 +906,7 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
 ; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
 ; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
-; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1
+; CHECK-NEXT:    store i8 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i8:
@@ -960,11 +960,11 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i8 %x, %y
-  store i8 %r, i8 addrspace(1)* %out
+  store i8 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
+define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; CHECK-LABEL: @srem_i8(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
@@ -989,7 +989,7 @@ define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
 ; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
 ; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
-; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1
+; CHECK-NEXT:    store i8 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i8:
@@ -1049,11 +1049,11 @@ define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i8 %x, %y
-  store i8 %r, i8 addrspace(1)* %out
+  store i8 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @udiv_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
@@ -1183,7 +1183,7 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; CHECK-NEXT:    [[TMP126:%.*]] = add i32 [[TMP122]], 1
 ; CHECK-NEXT:    [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]]
 ; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3
-; CHECK-NEXT:    store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <4 x i32> [[TMP128]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_v4i32:
@@ -1377,11 +1377,11 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <4 x i32> %x, %y
-  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @urem_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
@@ -1503,7 +1503,7 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; CHECK-NEXT:    [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]]
 ; CHECK-NEXT:    [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]]
 ; CHECK-NEXT:    [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3
-; CHECK-NEXT:    store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <4 x i32> [[TMP120]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_v4i32:
@@ -1677,11 +1677,11 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <4 x i32> %x, %y
-  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @sdiv_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
@@ -1847,7 +1847,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; CHECK-NEXT:    [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]]
 ; CHECK-NEXT:    [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]]
 ; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3
-; CHECK-NEXT:    store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <4 x i32> [[TMP164]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_v4i32:
@@ -2113,11 +2113,11 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <4 x i32> %x, %y
-  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @srem_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
@@ -2271,7 +2271,7 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; CHECK-NEXT:    [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]]
 ; CHECK-NEXT:    [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]]
 ; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3
-; CHECK-NEXT:    store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <4 x i32> [[TMP152]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_v4i32:
@@ -2509,11 +2509,11 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <4 x i32> %x, %y
-  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
+define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) {
 ; CHECK-LABEL: @udiv_v4i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
@@ -2595,7 +2595,7 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; CHECK-NEXT:    [[TMP78:%.*]] = and i32 [[TMP77]], 65535
 ; CHECK-NEXT:    [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
 ; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
-; CHECK-NEXT:    store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <4 x i16> [[TMP80]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_v4i16:
@@ -2715,11 +2715,11 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <4 x i16> %x, %y
-  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
+define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) {
 ; CHECK-LABEL: @urem_v4i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
@@ -2809,7 +2809,7 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; CHECK-NEXT:    [[TMP86:%.*]] = and i32 [[TMP85]], 65535
 ; CHECK-NEXT:    [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16
 ; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3
-; CHECK-NEXT:    store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <4 x i16> [[TMP88]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_v4i16:
@@ -2945,11 +2945,11 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <4 x i16> %x, %y
-  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
+define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) {
 ; CHECK-LABEL: @sdiv_v4i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
@@ -3047,7 +3047,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; CHECK-NEXT:    [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
 ; CHECK-NEXT:    [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
 ; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
-; CHECK-NEXT:    store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <4 x i16> [[TMP96]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_v4i16:
@@ -3207,11 +3207,11 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <4 x i16> %x, %y
-  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
+define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) {
 ; CHECK-LABEL: @srem_v4i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
@@ -3317,7 +3317,7 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
 ; CHECK-NEXT:    [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
 ; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
-; CHECK-NEXT:    store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <4 x i16> [[TMP104]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_v4i16:
@@ -3497,11 +3497,11 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <4 x i16> %x, %y
-  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
+define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
 ; CHECK-LABEL: @udiv_i3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
@@ -3520,7 +3520,7 @@ define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 7
 ; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3
-; CHECK-NEXT:    store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1
+; CHECK-NEXT:    store i3 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i3:
@@ -3566,11 +3566,11 @@ define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i3 %x, %y
-  store i3 %r, i3 addrspace(1)* %out
+  store i3 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
+define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
 ; CHECK-LABEL: @urem_i3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
@@ -3591,7 +3591,7 @@ define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 7
 ; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
-; CHECK-NEXT:    store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1
+; CHECK-NEXT:    store i3 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i3:
@@ -3644,11 +3644,11 @@ define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i3 %x, %y
-  store i3 %r, i3 addrspace(1)* %out
+  store i3 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
+define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
 ; CHECK-LABEL: @sdiv_i3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
@@ -3671,7 +3671,7 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 29
 ; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
 ; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
-; CHECK-NEXT:    store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1
+; CHECK-NEXT:    store i3 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i3:
@@ -3727,11 +3727,11 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i3 %x, %y
-  store i3 %r, i3 addrspace(1)* %out
+  store i3 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
+define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
 ; CHECK-LABEL: @srem_i3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
@@ -3756,7 +3756,7 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 29
 ; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
 ; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
-; CHECK-NEXT:    store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1
+; CHECK-NEXT:    store i3 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i3:
@@ -3819,11 +3819,11 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i3 %x, %y
-  store i3 %r, i3 addrspace(1)* %out
+  store i3 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
+define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) {
 ; CHECK-LABEL: @udiv_v3i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
@@ -3885,7 +3885,7 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
 ; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2
-; CHECK-NEXT:    store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <3 x i16> [[TMP60]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_v3i16:
@@ -3980,11 +3980,11 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-NEXT:    global_store_dword v6, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <3 x i16> %x, %y
-  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
+  store <3 x i16> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
+define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) {
 ; CHECK-LABEL: @urem_v3i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
@@ -4052,7 +4052,7 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
 ; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
 ; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2
-; CHECK-NEXT:    store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <3 x i16> [[TMP66]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_v3i16:
@@ -4160,11 +4160,11 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <3 x i16> %x, %y
-  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
+  store <3 x i16> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
+define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) {
 ; CHECK-LABEL: @sdiv_v3i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
@@ -4238,7 +4238,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
 ; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
 ; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
-; CHECK-NEXT:    store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <3 x i16> [[TMP72]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_v3i16:
@@ -4363,11 +4363,11 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-NEXT:    global_store_dword v1, v2, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <3 x i16> %x, %y
-  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
+  store <3 x i16> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
+define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) {
 ; CHECK-LABEL: @srem_v3i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
@@ -4447,7 +4447,7 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
 ; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
 ; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
-; CHECK-NEXT:    store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <3 x i16> [[TMP78]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_v3i16:
@@ -4587,11 +4587,11 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <3 x i16> %x, %y
-  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
+  store <3 x i16> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
+define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) {
 ; CHECK-LABEL: @udiv_v3i15(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
@@ -4653,7 +4653,7 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 32767
 ; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2
-; CHECK-NEXT:    store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <3 x i15> [[TMP60]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_v3i15:
@@ -4768,11 +4768,11 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <3 x i15> %x, %y
-  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
+  store <3 x i15> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
+define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) {
 ; CHECK-LABEL: @urem_v3i15(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
@@ -4840,7 +4840,7 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 32767
 ; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15
 ; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2
-; CHECK-NEXT:    store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <3 x i15> [[TMP66]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_v3i15:
@@ -4971,11 +4971,11 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
 ; GFX9-NEXT:    s_endpgm
   %r = urem <3 x i15> %x, %y
-  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
+  store <3 x i15> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
+define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) {
 ; CHECK-LABEL: @sdiv_v3i15(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
@@ -5049,7 +5049,7 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
 ; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
 ; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
-; CHECK-NEXT:    store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <3 x i15> [[TMP72]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_v3i15:
@@ -5192,11 +5192,11 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <3 x i15> %x, %y
-  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
+  store <3 x i15> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
+define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) {
 ; CHECK-LABEL: @srem_v3i15(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
@@ -5276,7 +5276,7 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
 ; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
 ; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
-; CHECK-NEXT:    store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <3 x i15> [[TMP78]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_v3i15:
@@ -5439,14 +5439,14 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
 ; GFX9-NEXT:    s_endpgm
   %r = srem <3 x i15> %x, %y
-  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
+  store <3 x i15> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
 ; CHECK-LABEL: @udiv_i32_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i32_oddk_denom:
@@ -5480,14 +5480,14 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i32 %x, 1235195
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
 ; CHECK-LABEL: @udiv_i32_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i32_pow2k_denom:
@@ -5513,15 +5513,15 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i32 %x, 4096
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i32_pow2_shl_denom:
@@ -5550,11 +5550,11 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = udiv i32 %x, %shl.y
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
+define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
 ; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
@@ -5562,7 +5562,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
-; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_v2i32_pow2k_denom:
@@ -5592,11 +5592,11 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
-  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
+define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
 ; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
@@ -5604,7 +5604,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
-; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom:
@@ -5642,11 +5642,11 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
-  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
@@ -5713,7 +5713,7 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
 ; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
 ; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1
-; CHECK-NEXT:    store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <2 x i32> [[TMP64]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
@@ -5826,14 +5826,14 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = udiv <2 x i32> %x, %shl.y
-  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
 ; CHECK-LABEL: @urem_i32_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i32_oddk_denom:
@@ -5872,14 +5872,14 @@ define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i32 %x, 1235195
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
 ; CHECK-LABEL: @urem_i32_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i32_pow2k_denom:
@@ -5905,15 +5905,15 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i32 %x, 4096
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-LABEL: @urem_i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i32_pow2_shl_denom:
@@ -5944,11 +5944,11 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = urem i32 %x, %shl.y
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
+define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
 ; CHECK-LABEL: @urem_v2i32_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
@@ -5956,7 +5956,7 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
-; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_v2i32_pow2k_denom:
@@ -5986,11 +5986,11 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <2 x i32> %x, <i32 4096, i32 4096>
-  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
@@ -6053,7 +6053,7 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
 ; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1
-; CHECK-NEXT:    store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <2 x i32> [[TMP60]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
@@ -6156,14 +6156,14 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = urem <2 x i32> %x, %shl.y
-  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
 ; CHECK-LABEL: @sdiv_i32_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i32_oddk_denom:
@@ -6197,14 +6197,14 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i32 %x, 1235195
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
 ; CHECK-LABEL: @sdiv_i32_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i32_pow2k_denom:
@@ -6236,15 +6236,15 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i32 %x, 4096
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i32_pow2_shl_denom:
@@ -6330,11 +6330,11 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = sdiv i32 %x, %shl.y
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
+define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
 ; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
@@ -6342,7 +6342,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
-; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_v2i32_pow2k_denom:
@@ -6384,11 +6384,11 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
-  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
+define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
 ; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
@@ -6396,7 +6396,7 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
-; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
@@ -6440,11 +6440,11 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
-  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
@@ -6529,7 +6529,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
 ; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
 ; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1
-; CHECK-NEXT:    store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <2 x i32> [[TMP82]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
@@ -6678,14 +6678,14 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = sdiv <2 x i32> %x, %shl.y
-  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
 ; CHECK-LABEL: @srem_i32_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i32_oddk_denom:
@@ -6725,14 +6725,14 @@ define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i32 %x, 1235195
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
 ; CHECK-LABEL: @srem_i32_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i32_pow2k_denom:
@@ -6766,15 +6766,15 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i32 %x, 4096
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-LABEL: @srem_i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i32_pow2_shl_denom:
@@ -6851,11 +6851,11 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = srem i32 %x, %shl.y
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
+define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
 ; CHECK-LABEL: @srem_v2i32_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
@@ -6863,7 +6863,7 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
-; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_v2i32_pow2k_denom:
@@ -6909,11 +6909,11 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out,
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <2 x i32> %x, <i32 4096, i32 4096>
-  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
@@ -6992,7 +6992,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
 ; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
 ; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1
-; CHECK-NEXT:    store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    store <2 x i32> [[TMP76]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
@@ -7127,14 +7127,14 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = srem <2 x i32> %x, %shl.y
-  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @udiv_i64_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
-; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i64_oddk_denom:
@@ -7390,14 +7390,14 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i64 %x, 1235195949943
-  store i64 %r, i64 addrspace(1)* %out
+  store i64 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @udiv_i64_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
-; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i64_pow2k_denom:
@@ -7425,15 +7425,15 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i64 %x, 4096
-  store i64 %r, i64 addrspace(1)* %out
+  store i64 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
-; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i64_pow2_shl_denom:
@@ -7466,11 +7466,11 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = udiv i64 %x, %shl.y
-  store i64 %r, i64 addrspace(1)* %out
+  store i64 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
+define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
 ; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
@@ -7478,7 +7478,7 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_v2i64_pow2k_denom:
@@ -7512,11 +7512,11 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
-  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
+define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
 ; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
@@ -7524,7 +7524,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom:
@@ -7758,11 +7758,11 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
-  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
@@ -7773,7 +7773,7 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
-; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_v2i64_pow2_shl_denom:
@@ -7812,14 +7812,14 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = udiv <2 x i64> %x, %shl.y
-  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @urem_i64_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
-; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i64_oddk_denom:
@@ -8072,14 +8072,14 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i64 %x, 1235195393993
-  store i64 %r, i64 addrspace(1)* %out
+  store i64 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @urem_i64_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
-; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i64_pow2k_denom:
@@ -8106,15 +8106,15 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
 ; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i64 %x, 4096
-  store i64 %r, i64 addrspace(1)* %out
+  store i64 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; CHECK-LABEL: @urem_i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
-; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i64_pow2_shl_denom:
@@ -8153,11 +8153,11 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = urem i64 %x, %shl.y
-  store i64 %r, i64 addrspace(1)* %out
+  store i64 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
+define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
 ; CHECK-LABEL: @urem_v2i64_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
@@ -8165,7 +8165,7 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = urem i64 [[TMP4]], 4096
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_v2i64_pow2k_denom:
@@ -8198,11 +8198,11 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX9-NEXT:    global_store_dwordx4 v1, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <2 x i64> %x, <i64 4096, i64 4096>
-  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
@@ -8213,7 +8213,7 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
-; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_v2i64_pow2_shl_denom:
@@ -8262,14 +8262,14 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = urem <2 x i64> %x, %shl.y
-  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @sdiv_i64_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
-; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i64_oddk_denom:
@@ -8517,14 +8517,14 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i64 %x, 1235195
-  store i64 %r, i64 addrspace(1)* %out
+  store i64 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @sdiv_i64_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
-; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i64_pow2k_denom:
@@ -8560,15 +8560,15 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i64 %x, 4096
-  store i64 %r, i64 addrspace(1)* %out
+  store i64 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
-; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
@@ -8861,11 +8861,11 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = sdiv i64 %x, %shl.y
-  store i64 %r, i64 addrspace(1)* %out
+  store i64 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
+define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
 ; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
@@ -8873,7 +8873,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_v2i64_pow2k_denom:
@@ -8923,11 +8923,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
-  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
+define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
 ; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
@@ -8935,7 +8935,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
@@ -9198,11 +9198,11 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
-  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
@@ -9213,7 +9213,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
-; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
@@ -9768,14 +9768,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = sdiv <2 x i64> %x, %shl.y
-  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @srem_i64_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
-; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i64_oddk_denom:
@@ -10017,14 +10017,14 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i64 %x, 1235195
-  store i64 %r, i64 addrspace(1)* %out
+  store i64 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @srem_i64_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
-; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i64_pow2k_denom:
@@ -10064,15 +10064,15 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i64 %x, 4096
-  store i64 %r, i64 addrspace(1)* %out
+  store i64 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; CHECK-LABEL: @srem_i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
-; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i64_pow2_shl_denom:
@@ -10360,11 +10360,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = srem i64 %x, %shl.y
-  store i64 %r, i64 addrspace(1)* %out
+  store i64 %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
+define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
 ; CHECK-LABEL: @srem_v2i64_pow2k_denom(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
@@ -10372,7 +10372,7 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
-; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_v2i64_pow2k_denom:
@@ -10430,11 +10430,11 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out,
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <2 x i64> %x, <i64 4096, i64 4096>
-  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %r, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
@@ -10445,7 +10445,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
-; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
@@ -10990,6 +10990,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = srem <2 x i64> %x, %shl.y
-  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %r, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
index 4993b95d228e2..47bbf3b6babf1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
@@ -17,6 +17,6 @@ define amdgpu_kernel void @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg
   %vi = bitcast float %v to i32
   %x = add i32 %vi, %w
   %xf = bitcast i32 %x to float
-  store float %xf, float addrspace(1)* undef
+  store float %xf, ptr addrspace(1) undef
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index 8f4f1c3915351..3e19ee5567929 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -5,66 +5,66 @@
 ; TRAP-HANDLER-ENABLE:  NumSgprs: 61
 ; TRAP-HANDLER-DISABLE: NumSgprs: 77
 define amdgpu_kernel void @amdhsa_trap_num_sgprs(
-    i32 addrspace(1)* %out0, i32 %in0,
-    i32 addrspace(1)* %out1, i32 %in1,
-    i32 addrspace(1)* %out2, i32 %in2,
-    i32 addrspace(1)* %out3, i32 %in3,
-    i32 addrspace(1)* %out4, i32 %in4,
-    i32 addrspace(1)* %out5, i32 %in5,
-    i32 addrspace(1)* %out6, i32 %in6,
-    i32 addrspace(1)* %out7, i32 %in7,
-    i32 addrspace(1)* %out8, i32 %in8,
-    i32 addrspace(1)* %out9, i32 %in9,
-    i32 addrspace(1)* %out10, i32 %in10,
-    i32 addrspace(1)* %out11, i32 %in11,
-    i32 addrspace(1)* %out12, i32 %in12,
-    i32 addrspace(1)* %out13, i32 %in13,
-    i32 addrspace(1)* %out14, i32 %in14,
-    i32 addrspace(1)* %out15, i32 %in15,
-    i32 addrspace(1)* %out16, i32 %in16,
-    i32 addrspace(1)* %out17, i32 %in17,
-    i32 addrspace(1)* %out18, i32 %in18,
-    i32 addrspace(1)* %out19, i32 %in19,
-    i32 addrspace(1)* %out20, i32 %in20,
-    i32 addrspace(1)* %out21, i32 %in21,
-    i32 addrspace(1)* %out22, i32 %in22,
-    i32 addrspace(1)* %out23, i32 %in23,
-    i32 addrspace(1)* %out24, i32 %in24,
-    i32 addrspace(1)* %out25, i32 %in25,
-    i32 addrspace(1)* %out26, i32 %in26,
-    i32 addrspace(1)* %out27, i32 %in27,
-    i32 addrspace(1)* %out28, i32 %in28,
-    i32 addrspace(1)* %out29, i32 %in29) {
+    ptr addrspace(1) %out0, i32 %in0,
+    ptr addrspace(1) %out1, i32 %in1,
+    ptr addrspace(1) %out2, i32 %in2,
+    ptr addrspace(1) %out3, i32 %in3,
+    ptr addrspace(1) %out4, i32 %in4,
+    ptr addrspace(1) %out5, i32 %in5,
+    ptr addrspace(1) %out6, i32 %in6,
+    ptr addrspace(1) %out7, i32 %in7,
+    ptr addrspace(1) %out8, i32 %in8,
+    ptr addrspace(1) %out9, i32 %in9,
+    ptr addrspace(1) %out10, i32 %in10,
+    ptr addrspace(1) %out11, i32 %in11,
+    ptr addrspace(1) %out12, i32 %in12,
+    ptr addrspace(1) %out13, i32 %in13,
+    ptr addrspace(1) %out14, i32 %in14,
+    ptr addrspace(1) %out15, i32 %in15,
+    ptr addrspace(1) %out16, i32 %in16,
+    ptr addrspace(1) %out17, i32 %in17,
+    ptr addrspace(1) %out18, i32 %in18,
+    ptr addrspace(1) %out19, i32 %in19,
+    ptr addrspace(1) %out20, i32 %in20,
+    ptr addrspace(1) %out21, i32 %in21,
+    ptr addrspace(1) %out22, i32 %in22,
+    ptr addrspace(1) %out23, i32 %in23,
+    ptr addrspace(1) %out24, i32 %in24,
+    ptr addrspace(1) %out25, i32 %in25,
+    ptr addrspace(1) %out26, i32 %in26,
+    ptr addrspace(1) %out27, i32 %in27,
+    ptr addrspace(1) %out28, i32 %in28,
+    ptr addrspace(1) %out29, i32 %in29) {
 entry:
-  store i32 %in0, i32 addrspace(1)* %out0
-  store i32 %in1, i32 addrspace(1)* %out1
-  store i32 %in2, i32 addrspace(1)* %out2
-  store i32 %in3, i32 addrspace(1)* %out3
-  store i32 %in4, i32 addrspace(1)* %out4
-  store i32 %in5, i32 addrspace(1)* %out5
-  store i32 %in6, i32 addrspace(1)* %out6
-  store i32 %in7, i32 addrspace(1)* %out7
-  store i32 %in8, i32 addrspace(1)* %out8
-  store i32 %in9, i32 addrspace(1)* %out9
-  store i32 %in10, i32 addrspace(1)* %out10
-  store i32 %in11, i32 addrspace(1)* %out11
-  store i32 %in12, i32 addrspace(1)* %out12
-  store i32 %in13, i32 addrspace(1)* %out13
-  store i32 %in14, i32 addrspace(1)* %out14
-  store i32 %in15, i32 addrspace(1)* %out15
-  store i32 %in16, i32 addrspace(1)* %out16
-  store i32 %in17, i32 addrspace(1)* %out17
-  store i32 %in18, i32 addrspace(1)* %out18
-  store i32 %in19, i32 addrspace(1)* %out19
-  store i32 %in20, i32 addrspace(1)* %out20
-  store i32 %in21, i32 addrspace(1)* %out21
-  store i32 %in22, i32 addrspace(1)* %out22
-  store i32 %in23, i32 addrspace(1)* %out23
-  store i32 %in24, i32 addrspace(1)* %out24
-  store i32 %in25, i32 addrspace(1)* %out25
-  store i32 %in26, i32 addrspace(1)* %out26
-  store i32 %in27, i32 addrspace(1)* %out27
-  store i32 %in28, i32 addrspace(1)* %out28
-  store i32 %in29, i32 addrspace(1)* %out29
+  store i32 %in0, ptr addrspace(1) %out0
+  store i32 %in1, ptr addrspace(1) %out1
+  store i32 %in2, ptr addrspace(1) %out2
+  store i32 %in3, ptr addrspace(1) %out3
+  store i32 %in4, ptr addrspace(1) %out4
+  store i32 %in5, ptr addrspace(1) %out5
+  store i32 %in6, ptr addrspace(1) %out6
+  store i32 %in7, ptr addrspace(1) %out7
+  store i32 %in8, ptr addrspace(1) %out8
+  store i32 %in9, ptr addrspace(1) %out9
+  store i32 %in10, ptr addrspace(1) %out10
+  store i32 %in11, ptr addrspace(1) %out11
+  store i32 %in12, ptr addrspace(1) %out12
+  store i32 %in13, ptr addrspace(1) %out13
+  store i32 %in14, ptr addrspace(1) %out14
+  store i32 %in15, ptr addrspace(1) %out15
+  store i32 %in16, ptr addrspace(1) %out16
+  store i32 %in17, ptr addrspace(1) %out17
+  store i32 %in18, ptr addrspace(1) %out18
+  store i32 %in19, ptr addrspace(1) %out19
+  store i32 %in20, ptr addrspace(1) %out20
+  store i32 %in21, ptr addrspace(1) %out21
+  store i32 %in22, ptr addrspace(1) %out22
+  store i32 %in23, ptr addrspace(1) %out23
+  store i32 %in24, ptr addrspace(1) %out24
+  store i32 %in25, ptr addrspace(1) %out25
+  store i32 %in26, ptr addrspace(1) %out26
+  store i32 %in27, ptr addrspace(1) %out27
+  store i32 %in28, ptr addrspace(1) %out28
+  store i32 %in29, ptr addrspace(1) %out29
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 1cb49b817e19e..14db2ab9c419c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -5,7 +5,7 @@
 declare amdgpu_gfx float @extern_func(float) #0
 declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0
 
-@funcptr = external hidden unnamed_addr addrspace(4) constant void()*, align 4
+@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4
 
 define amdgpu_gfx float @no_stack(float %arg0) #0 {
   %add = fadd float %arg0, 1.0
@@ -14,20 +14,20 @@ define amdgpu_gfx float @no_stack(float %arg0) #0 {
 
 define amdgpu_gfx float @simple_stack(float %arg0) #0 {
   %stack = alloca float, i32 4, align 4, addrspace(5)
-  store volatile float 2.0, float addrspace(5)* %stack
-  %val = load volatile float, float addrspace(5)* %stack
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
   %add = fadd float %arg0, %val
   ret float %add
 }
 
 define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
   %stack = alloca float, i32 4, align 4, addrspace(5)
-  store volatile float 2.0, float addrspace(5)* %stack
-  %val = load volatile float, float addrspace(5)* %stack
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
   %add = fadd float %arg0, %val
   %stack2 = alloca float, i32 4, align 4, addrspace(5)
-  store volatile float 2.0, float addrspace(5)* %stack2
-  %val2 = load volatile float, float addrspace(5)* %stack2
+  store volatile float 2.0, ptr addrspace(5) %stack2
+  %val2 = load volatile float, ptr addrspace(5) %stack2
   %add2 = fadd float %add, %val2
   ret float %add2
 }
@@ -39,8 +39,8 @@ bb0:
 
 bb1:
   %stack = alloca float, i32 4, align 4, addrspace(5)
-  store volatile float 2.0, float addrspace(5)* %stack
-  %val = load volatile float, float addrspace(5)* %stack
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
   %add = fadd float %arg0, %val
   br label %bb2
 
@@ -56,8 +56,8 @@ bb0:
 bb1:
   %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
   %stack = alloca float, i32 4, align 4, addrspace(5)
-  store volatile float 2.0, float addrspace(5)* %stack
-  %val = load volatile float, float addrspace(5)* %stack
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
   %add = fadd float %arg0, %val
   %cmp = icmp sgt i32 %ctr, 0
   %newctr = sub i32 %ctr, 1
@@ -74,8 +74,8 @@ define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
 
 define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
   %stack = alloca float, i32 4, align 4, addrspace(5)
-  store volatile float 2.0, float addrspace(5)* %stack
-  %val = load volatile float, float addrspace(5)* %stack
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
   %res = call amdgpu_gfx float @simple_stack(float %arg0)
   %add = fadd float %res, %val
   ret float %add
@@ -88,8 +88,8 @@ define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
 
 define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
   %stack = alloca float, i32 4, align 4, addrspace(5)
-  store volatile float 2.0, float addrspace(5)* %stack
-  %val = load volatile float, float addrspace(5)* %stack
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
   %res = call amdgpu_gfx float @extern_func(float %arg0)
   %add = fadd float %res, %val
   ret float %add
@@ -101,16 +101,16 @@ define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
 }
 
 define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
-  %fptr = load void()*, void()* addrspace(4)* @funcptr
+  %fptr = load ptr, ptr addrspace(4) @funcptr
   call amdgpu_gfx void %fptr()
   ret float %arg0
 }
 
 define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
   %stack = alloca float, i32 4, align 4, addrspace(5)
-  store volatile float 2.0, float addrspace(5)* %stack
-  %val = load volatile float, float addrspace(5)* %stack
-  %fptr = load void()*, void()* addrspace(4)* @funcptr
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
+  %fptr = load ptr, ptr addrspace(4) @funcptr
   call amdgpu_gfx void %fptr()
   %add = fadd float %arg0, %val
   ret float %add
@@ -118,8 +118,8 @@ define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
 
 define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
   %stack = alloca float, i32 4, align 4, addrspace(5)
-  store volatile float 2.0, float addrspace(5)* %stack
-  %val = load volatile float, float addrspace(5)* %stack
+  store volatile float 2.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
   %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
   %add = fadd float %res, %val
   ret float %add
@@ -128,14 +128,12 @@ define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
 @lds = internal addrspace(3) global [64 x float] undef
 
 define amdgpu_gfx float @simple_lds(float %arg0) #0 {
-  %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
-  %val = load float, float addrspace(3)* %lds_ptr
+  %val = load float, ptr addrspace(3) @lds
   ret float %val
 }
 
 define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
-  %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
-  %val = load float, float addrspace(3)* %lds_ptr
+  %val = load float, ptr addrspace(3) @lds
   %res = call amdgpu_gfx float @simple_lds_recurse(float %val)
   ret float %res
 }

diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
index 15cf20a8079f8..87084d780410b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
@@ -26,8 +26,8 @@
 ; GFX10: NumSGPRsForWavesPerEU: 2
 ; GFX10: NumVGPRsForWavesPerEU: 1
 
-define amdgpu_kernel void @simple(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @simple(ptr addrspace(1) %out) {
 entry:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll b/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
index 0948bb590d409..20e1d6abc7978 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll
@@ -23,12 +23,12 @@ define amdgpu_hs void @_amdgpu_hs_main(i32 inreg %arg, i32 inreg %arg1, i32 inre
   br label %.endls
 
 .endls:                                           ; preds = %.beginls, %.entry
-  %.fca.2.gep120.i = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>] addrspace(5)* %__llpc_global_proxy_7.i, i64 0, i64 2
-  store volatile <4 x float> <float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01>, <4 x float> addrspace(5)* %.fca.2.gep120.i, align 16
+  %.fca.2.gep120.i = getelementptr inbounds [3 x <4 x float>], ptr addrspace(5) %__llpc_global_proxy_7.i, i64 0, i64 2
+  store volatile <4 x float> <float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01>, ptr addrspace(5) %.fca.2.gep120.i, align 16
   br label %bb
 
 bb:                                               ; preds = %bb, %.endls
-  %lsr.iv182 = phi [3 x <4 x float>] addrspace(5)* [ undef, %bb ], [ %__llpc_global_proxy_7.i, %.endls ]
-  %scevgep183 = getelementptr [3 x <4 x float>], [3 x <4 x float>] addrspace(5)* %lsr.iv182, i32 0, i32 1
+  %lsr.iv182 = phi ptr addrspace(5) [ undef, %bb ], [ %__llpc_global_proxy_7.i, %.endls ]
+  %scevgep183 = getelementptr [3 x <4 x float>], ptr addrspace(5) %lsr.iv182, i32 0, i32 1
   br label %bb
 }

diff --git a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
index a16fd534247ec..e2624a4375c9a 100644
--- a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
@@ -2,48 +2,48 @@
 
 ; SI-LABEL: {{^}}s_or_to_orn2:
 ; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
-define amdgpu_kernel void @s_or_to_orn2(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_or_to_orn2(ptr addrspace(1) %out, i32 %in) {
   %x = or i32 %in, -51
-  store i32 %x, i32 addrspace(1)* %out
+  store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
 ; SI-LABEL: {{^}}s_or_to_orn2_imm0:
 ; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
-define amdgpu_kernel void @s_or_to_orn2_imm0(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_or_to_orn2_imm0(ptr addrspace(1) %out, i32 %in) {
   %x = or i32 -51, %in
-  store i32 %x, i32 addrspace(1)* %out
+  store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
 ; SI-LABEL: {{^}}s_and_to_andn2:
 ; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
-define amdgpu_kernel void @s_and_to_andn2(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_and_to_andn2(ptr addrspace(1) %out, i32 %in) {
   %x = and i32 %in, -51
-  store i32 %x, i32 addrspace(1)* %out
+  store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
 ; SI-LABEL: {{^}}s_and_to_andn2_imm0:
 ; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
-define amdgpu_kernel void @s_and_to_andn2_imm0(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_and_to_andn2_imm0(ptr addrspace(1) %out, i32 %in) {
   %x = and i32 -51, %in
-  store i32 %x, i32 addrspace(1)* %out
+  store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
 ; SI-LABEL: {{^}}s_xor_to_xnor:
 ; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
-define amdgpu_kernel void @s_xor_to_xnor(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_xor_to_xnor(ptr addrspace(1) %out, i32 %in) {
   %x = xor i32 %in, -51
-  store i32 %x, i32 addrspace(1)* %out
+  store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
 ; SI-LABEL: {{^}}s_xor_to_xnor_imm0:
 ; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
-define amdgpu_kernel void @s_xor_to_xnor_imm0(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_xor_to_xnor_imm0(ptr addrspace(1) %out, i32 %in) {
   %x = xor i32 -51, %in
-  store i32 %x, i32 addrspace(1)* %out
+  store i32 %x, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 0c9248a7c0bab..b21fee3bad8f0 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -18,11 +18,11 @@
 ; PREGFX11: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
 ; GFX11: ds_cmpstore_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VSWAP]], [[VCMP]] offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, [8 x i32], i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(ptr addrspace(1) %out, [8 x i32], ptr addrspace(3) %ptr, [8 x i32], i32 %swap) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -43,11 +43,11 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %
 ; GFX11: ds_cmpstore_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32
 ; GCN: [[RESULT]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i64 %swap) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %pair = cmpxchg ptr addrspace(3) %gep, i64 7, i64 %swap seq_cst monotonic
   %result = extractvalue { i64, i1 } %pair, 0
-  store i64 %result, i64 addrspace(1)* %out, align 8
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -57,13 +57,13 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %
 ; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GFX9PLUS: ds_{{cmpst|cmpstore}}_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
-  %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 %add
+  %pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -82,9 +82,9 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspac
 ; PREGFX11: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
 ; GFX11: ds_cmpstore_b32 [[VPTR]], [[VSWAP]], [[VCMP]] offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
+define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(ptr addrspace(3) %ptr, [8 x i32], i32 %swap) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
   ret void
 }
@@ -105,9 +105,9 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)*
 ; PREGFX11: ds_cmpst_b64 [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32
 ; GFX11: ds_cmpstore_b64 [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
+define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(ptr addrspace(3) %ptr, i64 %swap) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %pair = cmpxchg ptr addrspace(3) %gep, i64 7, i64 %swap seq_cst monotonic
   %result = extractvalue { i64, i1 } %pair, 0
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll
index 76090b73bc361..cf6011d8c2eaf 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll
@@ -8,8 +8,8 @@
 ; GFX9-NOT: m0
 ; R600: LDS_ADD *
 ; GCN: ds_add_u32
-define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) {
-   %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
+define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
+   %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst
    ret void
 }
 
@@ -19,9 +19,9 @@ define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) {
 
 ; R600: LDS_ADD *
 ; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
-  %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
-  %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
+define amdgpu_kernel void @atomic_add_local_const_offset(ptr addrspace(3) %local) {
+  %gep = getelementptr i32, ptr addrspace(3) %local, i32 4
+  %val = atomicrmw volatile add ptr addrspace(3) %gep, i32 5 seq_cst
   ret void
 }
 
@@ -31,9 +31,9 @@ define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %loca
 
 ; R600: LDS_ADD_RET *
 ; GCN: ds_add_rtn_u32
-define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
-  %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrspace(3) %local) {
+  %val = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -43,9 +43,9 @@ define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addr
 
 ; R600: LDS_ADD_RET *
 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
-define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
-  %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
-  %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_add_ret_local_const_offset(ptr addrspace(1) %out, ptr addrspace(3) %local) {
+  %gep = getelementptr i32, ptr addrspace(3) %local, i32 5
+  %val = atomicrmw volatile add ptr addrspace(3) %gep, i32 5 seq_cst
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
index 4534767353daa..bfd18f1b52a51 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
@@ -8,8 +8,8 @@
 ; GCN-NEXT: ds_read_u8 v0, v0{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define i8 @atomic_load_monotonic_i8(i8 addrspace(3)* %ptr) {
-  %load = load atomic i8, i8 addrspace(3)* %ptr monotonic, align 1
+define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) {
+  %load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1
   ret i8 %load
 }
 
@@ -20,9 +20,9 @@ define i8 @atomic_load_monotonic_i8(i8 addrspace(3)* %ptr) {
 ; GCN-NEXT: ds_read_u8 v0, v0 offset:16{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define i8 @atomic_load_monotonic_i8_offset(i8 addrspace(3)* %ptr) {
-  %gep = getelementptr inbounds i8, i8 addrspace(3)* %ptr, i8 16
-  %load = load atomic i8, i8 addrspace(3)* %gep monotonic, align 1
+define i8 @atomic_load_monotonic_i8_offset(ptr addrspace(3) %ptr) {
+  %gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16
+  %load = load atomic i8, ptr addrspace(3) %gep monotonic, align 1
   ret i8 %load
 }
 
@@ -33,8 +33,8 @@ define i8 @atomic_load_monotonic_i8_offset(i8 addrspace(3)* %ptr) {
 ; GCN-NEXT: ds_read_u16 v0, v0{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define i16 @atomic_load_monotonic_i16(i16 addrspace(3)* %ptr) {
-  %load = load atomic i16, i16 addrspace(3)* %ptr monotonic, align 2
+define i16 @atomic_load_monotonic_i16(ptr addrspace(3) %ptr) {
+  %load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
   ret i16 %load
 }
 
@@ -45,9 +45,9 @@ define i16 @atomic_load_monotonic_i16(i16 addrspace(3)* %ptr) {
 ; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define i16 @atomic_load_monotonic_i16_offset(i16 addrspace(3)* %ptr) {
-  %gep = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i16 16
-  %load = load atomic i16, i16 addrspace(3)* %gep monotonic, align 2
+define i16 @atomic_load_monotonic_i16_offset(ptr addrspace(3) %ptr) {
+  %gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16
+  %load = load atomic i16, ptr addrspace(3) %gep monotonic, align 2
   ret i16 %load
 }
 
@@ -58,8 +58,8 @@ define i16 @atomic_load_monotonic_i16_offset(i16 addrspace(3)* %ptr) {
 ; GCN-NEXT: ds_read_b32 v0, v0{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define i32 @atomic_load_monotonic_i32(i32 addrspace(3)* %ptr) {
-  %load = load atomic i32, i32 addrspace(3)* %ptr monotonic, align 4
+define i32 @atomic_load_monotonic_i32(ptr addrspace(3) %ptr) {
+  %load = load atomic i32, ptr addrspace(3) %ptr monotonic, align 4
   ret i32 %load
 }
 
@@ -70,9 +70,9 @@ define i32 @atomic_load_monotonic_i32(i32 addrspace(3)* %ptr) {
 ; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define i32 @atomic_load_monotonic_i32_offset(i32 addrspace(3)* %ptr) {
-  %gep = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 16
-  %load = load atomic i32, i32 addrspace(3)* %gep monotonic, align 4
+define i32 @atomic_load_monotonic_i32_offset(ptr addrspace(3) %ptr) {
+  %gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16
+  %load = load atomic i32, ptr addrspace(3) %gep monotonic, align 4
   ret i32 %load
 }
 
@@ -83,8 +83,8 @@ define i32 @atomic_load_monotonic_i32_offset(i32 addrspace(3)* %ptr) {
 ; GCN-NEXT: ds_read_b64 v[0:1], v0{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define i64 @atomic_load_monotonic_i64(i64 addrspace(3)* %ptr) {
-  %load = load atomic i64, i64 addrspace(3)* %ptr monotonic, align 8
+define i64 @atomic_load_monotonic_i64(ptr addrspace(3) %ptr) {
+  %load = load atomic i64, ptr addrspace(3) %ptr monotonic, align 8
   ret i64 %load
 }
 
@@ -95,9 +95,9 @@ define i64 @atomic_load_monotonic_i64(i64 addrspace(3)* %ptr) {
 ; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define i64 @atomic_load_monotonic_i64_offset(i64 addrspace(3)* %ptr) {
-  %gep = getelementptr inbounds i64, i64 addrspace(3)* %ptr, i32 16
-  %load = load atomic i64, i64 addrspace(3)* %gep monotonic, align 8
+define i64 @atomic_load_monotonic_i64_offset(ptr addrspace(3) %ptr) {
+  %gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 16
+  %load = load atomic i64, ptr addrspace(3) %gep monotonic, align 8
   ret i64 %load
 }
 
@@ -108,9 +108,9 @@ define i64 @atomic_load_monotonic_i64_offset(i64 addrspace(3)* %ptr) {
 ; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define float @atomic_load_monotonic_f32_offset(float addrspace(3)* %ptr) {
-  %gep = getelementptr inbounds float, float addrspace(3)* %ptr, i32 16
-  %load = load atomic float, float addrspace(3)* %gep monotonic, align 4
+define float @atomic_load_monotonic_f32_offset(ptr addrspace(3) %ptr) {
+  %gep = getelementptr inbounds float, ptr addrspace(3) %ptr, i32 16
+  %load = load atomic float, ptr addrspace(3) %gep monotonic, align 4
   ret float %load
 }
 
@@ -121,9 +121,9 @@ define float @atomic_load_monotonic_f32_offset(float addrspace(3)* %ptr) {
 ; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define double @atomic_load_monotonic_f64_offset(double addrspace(3)* %ptr) {
-  %gep = getelementptr inbounds double, double addrspace(3)* %ptr, i32 16
-  %load = load atomic double, double addrspace(3)* %gep monotonic, align 8
+define double @atomic_load_monotonic_f64_offset(ptr addrspace(3) %ptr) {
+  %gep = getelementptr inbounds double, ptr addrspace(3) %ptr, i32 16
+  %load = load atomic double, ptr addrspace(3) %gep monotonic, align 8
   ret double %load
 }
 
@@ -134,10 +134,10 @@ define double @atomic_load_monotonic_f64_offset(double addrspace(3)* %ptr) {
 ; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define i8* @atomic_load_monotonic_p0i8_offset(i8* addrspace(3)* %ptr) {
-  %gep = getelementptr inbounds i8*, i8* addrspace(3)* %ptr, i32 16
-  %load = load atomic i8*, i8* addrspace(3)* %gep monotonic, align 8
-  ret i8* %load
+define ptr @atomic_load_monotonic_p0i8_offset(ptr addrspace(3) %ptr) {
+  %gep = getelementptr inbounds ptr, ptr addrspace(3) %ptr, i32 16
+  %load = load atomic ptr, ptr addrspace(3) %gep monotonic, align 8
+  ret ptr %load
 }
 
 ; GCN-LABEL: {{^}}atomic_load_monotonic_p3i8_offset:
@@ -147,8 +147,8 @@ define i8* @atomic_load_monotonic_p0i8_offset(i8* addrspace(3)* %ptr) {
 ; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define i8 addrspace(3)* @atomic_load_monotonic_p3i8_offset(i8 addrspace(3)* addrspace(3)* %ptr) {
-  %gep = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %ptr, i32 16
-  %load = load atomic i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %gep monotonic, align 4
-  ret i8 addrspace(3)* %load
+define ptr addrspace(3) @atomic_load_monotonic_p3i8_offset(ptr addrspace(3) %ptr) {
+  %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %ptr, i32 16
+  %load = load atomic ptr addrspace(3), ptr addrspace(3) %gep monotonic, align 4
+  ret ptr addrspace(3) %load
 }

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll
index b48ed36dfc203..cc6762e0bad4a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll
@@ -9,8 +9,8 @@
 
 ; R600: LDS_SUB *
 ; GCN: ds_sub_u32
-define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) {
-   %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
+define amdgpu_kernel void @atomic_sub_local(ptr addrspace(3) %local) {
+   %unused = atomicrmw volatile sub ptr addrspace(3) %local, i32 5 seq_cst
    ret void
 }
 
@@ -20,9 +20,9 @@ define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) {
 
 ; R600: LDS_SUB *
 ; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
-  %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
-  %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
+define amdgpu_kernel void @atomic_sub_local_const_offset(ptr addrspace(3) %local) {
+  %gep = getelementptr i32, ptr addrspace(3) %local, i32 4
+  %val = atomicrmw volatile sub ptr addrspace(3) %gep, i32 5 seq_cst
   ret void
 }
 
@@ -32,9 +32,9 @@ define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %loca
 
 ; R600: LDS_SUB_RET *
 ; GCN: ds_sub_rtn_u32
-define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
-  %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_sub_ret_local(ptr addrspace(1) %out, ptr addrspace(3) %local) {
+  %val = atomicrmw volatile sub ptr addrspace(3) %local, i32 5 seq_cst
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -44,9 +44,9 @@ define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addr
 
 ; R600: LDS_SUB_RET *
 ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
-define amdgpu_kernel void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
-  %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
-  %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_sub_ret_local_const_offset(ptr addrspace(1) %out, ptr addrspace(3) %local) {
+  %gep = getelementptr i32, ptr addrspace(3) %local, i32 5
+  %val = atomicrmw volatile sub ptr addrspace(3) %gep, i32 5 seq_cst
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index bb4c0b7aca47b..c407f9de969a6 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -14,7 +14,7 @@ declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32 imm
 
 ; Show what the atomic optimization pass will do for raw buffers.
 
-define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: add_i32_constant:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
@@ -215,11 +215,11 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %i
 ; GFX11W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
+define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, <4 x i32> %inout, i32 %additive) {
 ; GFX6-LABEL: add_i32_uniform:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
@@ -430,11 +430,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX11W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: add_i32_varying_vdata:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -761,11 +761,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %vindex) {
+define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout, i32 %vindex) {
 ; GFX6-LABEL: struct_add_i32_varying_vdata:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x11
@@ -1109,11 +1109,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out,
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 %vindex, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: add_i32_varying_offset:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -1179,11 +1179,11 @@ define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: sub_i32_constant:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
@@ -1391,11 +1391,11 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %i
 ; GFX11W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
+define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, <4 x i32> %inout, i32 %subitive) {
 ; GFX6-LABEL: sub_i32_uniform:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
@@ -1610,11 +1610,11 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX11W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: sub_i32_varying_vdata:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -1941,11 +1941,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: sub_i32_varying_offset:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -2011,6 +2011,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index ab6356559b2bc..9102086a51641 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -11,7 +11,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 
 ; Show what the atomic optimization pass will do for global pointers.
 
-define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
+define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
 ; GFX7LESS-LABEL: add_i32_constant:
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
@@ -224,12 +224,12 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspac
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw add ptr addrspace(1) %inout, i32 5 acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
+define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) {
 ; GFX7LESS-LABEL: add_i32_uniform:
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
@@ -489,12 +489,12 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw add ptr addrspace(1) %inout, i32 %additive acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
+define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
 ; GFX7LESS-LABEL: add_i32_varying:
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -883,12 +883,12 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
+define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
 ; GFX7LESS-LABEL: add_i64_constant:
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
@@ -1120,12 +1120,12 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw add ptr addrspace(1) %inout, i64 5 acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
+define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 %additive) {
 ; GFX7LESS-LABEL: add_i64_uniform:
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
@@ -1437,12 +1437,12 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
+define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
 ; GFX7LESS-LABEL: add_i64_varying:
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1530,12 +1530,12 @@ define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %zext = zext i32 %lane to i64
-  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw add ptr addrspace(1) %inout, i64 %zext acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
+define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
 ; GFX7LESS-LABEL: sub_i32_constant:
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
@@ -1788,12 +1788,12 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspac
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw sub ptr addrspace(1) %inout, i32 5 acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {
+define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %subitive) {
 ; GFX7LESS-LABEL: sub_i32_uniform:
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
@@ -2057,12 +2057,12 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
+define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
 ; GFX7LESS-LABEL: sub_i32_varying:
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2451,12 +2451,12 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw sub ptr addrspace(1) %inout, i32 %lane acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
+define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
 ; GFX7LESS-LABEL: sub_i64_constant:
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
@@ -2740,12 +2740,12 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw sub ptr addrspace(1) %inout, i64 5 acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
+define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 %subitive) {
 ; GFX7LESS-LABEL: sub_i64_uniform:
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
@@ -3070,12 +3070,12 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw sub ptr addrspace(1) %inout, i64 %subitive acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
+define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %inout) {
 ; GFX7LESS-LABEL: sub_i64_varying:
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3163,7 +3163,7 @@ define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %zext = zext i32 %lane to i64
-  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw sub ptr addrspace(1) %inout, i64 %zext acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index c6ba43a59188a..323050f4f8f5b 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -14,7 +14,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 
 ; Show what the atomic optimization pass will do for local pointers.
 
-define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: add_i32_constant:
@@ -230,12 +230,12 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw add ptr addrspace(3) @local_var32, i32 5 acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
+define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) {
 ;
 ;
 ; GFX7LESS-LABEL: add_i32_uniform:
@@ -468,12 +468,12 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %additive acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: add_i32_varying:
@@ -814,8 +814,8 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
@@ -1043,11 +1043,11 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+  %old = atomicrmw add ptr addrspace(3) @local_var32, i32 %lane acq_rel
   ret void
 }
 
-define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: add_i64_constant:
@@ -1283,12 +1283,12 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw add ptr addrspace(3) @local_var64, i64 5 acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
+define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) {
 ;
 ;
 ; GFX7LESS-LABEL: add_i64_uniform:
@@ -1570,12 +1570,12 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %additive acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: add_i64_varying:
@@ -1647,12 +1647,12 @@ define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %zext = zext i32 %lane to i64
-  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %zext acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: sub_i32_constant:
@@ -1875,12 +1875,12 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 5 acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
+define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) {
 ;
 ;
 ; GFX7LESS-LABEL: sub_i32_uniform:
@@ -2117,12 +2117,12 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %subitive acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: sub_i32_varying:
@@ -2463,8 +2463,8 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
@@ -2692,11 +2692,11 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
+  %old = atomicrmw sub ptr addrspace(3) @local_var32, i32 %lane acq_rel
   ret void
 }
 
-define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: sub_i64_constant:
@@ -2944,12 +2944,12 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 5 acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
+define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) {
 ;
 ;
 ; GFX7LESS-LABEL: sub_i64_uniform:
@@ -3244,12 +3244,12 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 %subitive acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: sub_i64_varying:
@@ -3321,12 +3321,12 @@ define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %zext = zext i32 %lane to i64
-  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw sub ptr addrspace(3) @local_var64, i64 %zext acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: and_i32_varying:
@@ -3671,12 +3671,12 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw and ptr addrspace(3) @local_var32, i32 %lane acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: or_i32_varying:
@@ -4017,12 +4017,12 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw or ptr addrspace(3) @local_var32, i32 %lane acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: xor_i32_varying:
@@ -4363,12 +4363,12 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw xor ptr addrspace(3) @local_var32, i32 %lane acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: max_i32_varying:
@@ -4713,12 +4713,12 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw max ptr addrspace(3) @local_var32, i32 %lane acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: max_i64_constant:
@@ -4964,12 +4964,12 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw max ptr addrspace(3) @local_var64, i64 5 acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: min_i32_varying:
@@ -5314,12 +5314,12 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw min ptr addrspace(3) @local_var32, i32 %lane acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: min_i64_constant:
@@ -5565,12 +5565,12 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw min ptr addrspace(3) @local_var64, i64 5 acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: umax_i32_varying:
@@ -5911,12 +5911,12 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw umax ptr addrspace(3) @local_var32, i32 %lane acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: umax_i64_constant:
@@ -6157,12 +6157,12 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw umax ptr addrspace(3) @local_var64, i64 5 acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: umin_i32_varying:
@@ -6507,12 +6507,12 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
-  store i32 %old, i32 addrspace(1)* %out
+  %old = atomicrmw umin ptr addrspace(3) @local_var32, i32 %lane acq_rel
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ;
 ;
 ; GFX7LESS-LABEL: umin_i64_constant:
@@ -6753,7 +6753,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
-  store i64 %old, i64 addrspace(1)* %out
+  %old = atomicrmw umin ptr addrspace(3) @local_var64, i64 5 acq_rel
+  store i64 %old, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index f83f31b486527..187eff11cb84f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32)
 
 ; Show what the atomic optimization pass will do for raw buffers.
 
-define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: add_i32_constant:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
@@ -214,11 +214,11 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %i
 ; GFX11W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
+define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, <4 x i32> %inout, i32 %additive) {
 ; GFX6-LABEL: add_i32_uniform:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
@@ -429,11 +429,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX11W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: add_i32_varying_vdata:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -760,11 +760,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: add_i32_varying_offset:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -830,11 +830,11 @@ define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: sub_i32_constant:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
@@ -1042,11 +1042,11 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %i
 ; GFX11W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
+define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, <4 x i32> %inout, i32 %subitive) {
 ; GFX6-LABEL: sub_i32_uniform:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
@@ -1261,11 +1261,11 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX11W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: sub_i32_varying_vdata:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -1592,11 +1592,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: sub_i32_varying_offset:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -1662,6 +1662,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index cb9a3df64e29f..0d37f0e814dd5 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32,
 
 ; Show what the atomic optimization pass will do for struct buffers.
 
-define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: add_i32_constant:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
@@ -221,11 +221,11 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %i
 ; GFX11W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
+define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, <4 x i32> %inout, i32 %additive) {
 ; GFX6-LABEL: add_i32_uniform:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
@@ -443,11 +443,11 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX11W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: add_i32_varying_vdata:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -774,11 +774,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: add_i32_varying_vindex:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -844,11 +844,11 @@ define amdgpu_kernel void @add_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: add_i32_varying_offset:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v0
@@ -946,11 +946,11 @@ define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: sub_i32_constant:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
@@ -1165,11 +1165,11 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %i
 ; GFX11W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
+define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, <4 x i32> %inout, i32 %subitive) {
 ; GFX6-LABEL: sub_i32_uniform:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
@@ -1391,11 +1391,11 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
 ; GFX11W32-NEXT:    s_endpgm
 entry:
   %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: sub_i32_varying_vdata:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -1722,11 +1722,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: sub_i32_varying_vindex:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -1792,11 +1792,11 @@ define amdgpu_kernel void @sub_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
+define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, <4 x i32> %inout) {
 ; GFX6-LABEL: sub_i32_varying_offset:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v0
@@ -1894,6 +1894,6 @@ define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
-  store i32 %old, i32 addrspace(1)* %out
+  store i32 %old, ptr addrspace(1) %out
   ret void
 }

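(Note: every hunk in these test updates applies the same mechanical rewrite -- a typed pointee type such as "i32 addrspace(1)*" or "float*" becomes the opaque "ptr" type, the address space is preserved, and the value types of loads, stores and atomics remain spelled out explicitly. A minimal before/after sketch of that pattern, using a hypothetical kernel name @example that does not appear in the committed diff:

; typed-pointer form (pre-conversion), hypothetical example only
define amdgpu_kernel void @example(i32 addrspace(1)* %out, i32 %val) {
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; opaque-pointer form (post-conversion), hypothetical example only
define amdgpu_kernel void @example(ptr addrspace(1) %out, i32 %val) {
  store i32 %val, ptr addrspace(1) %out
  ret void
}
)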
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
index 4e80f5d05b796..71e24c1692c7f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
@@ -8,8 +8,8 @@
 ; GCN-NEXT: ds_write_b8 v0, v1{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define void @atomic_store_monotonic_i8(i8 addrspace(3)* %ptr, i8 %val) {
-  store atomic i8 %val, i8 addrspace(3)* %ptr monotonic, align 1
+define void @atomic_store_monotonic_i8(ptr addrspace(3) %ptr, i8 %val) {
+  store atomic i8 %val, ptr addrspace(3) %ptr monotonic, align 1
   ret void
 }
 
@@ -20,9 +20,9 @@ define void @atomic_store_monotonic_i8(i8 addrspace(3)* %ptr, i8 %val) {
 ; GCN-NEXT: ds_write_b8 v0, v1 offset:16{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define void @atomic_store_monotonic_offset_i8(i8 addrspace(3)* %ptr, i8 %val) {
-  %gep = getelementptr inbounds i8, i8 addrspace(3)* %ptr, i8 16
-  store atomic i8 %val, i8 addrspace(3)* %gep monotonic, align 1
+define void @atomic_store_monotonic_offset_i8(ptr addrspace(3) %ptr, i8 %val) {
+  %gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16
+  store atomic i8 %val, ptr addrspace(3) %gep monotonic, align 1
   ret void
 }
 
@@ -33,8 +33,8 @@ define void @atomic_store_monotonic_offset_i8(i8 addrspace(3)* %ptr, i8 %val) {
 ; GCN-NEXT: ds_write_b16 v0, v1{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define void @atomic_store_monotonic_i16(i16 addrspace(3)* %ptr, i16 %val) {
-  store atomic i16 %val, i16 addrspace(3)* %ptr monotonic, align 2
+define void @atomic_store_monotonic_i16(ptr addrspace(3) %ptr, i16 %val) {
+  store atomic i16 %val, ptr addrspace(3) %ptr monotonic, align 2
   ret void
 }
 
@@ -45,9 +45,9 @@ define void @atomic_store_monotonic_i16(i16 addrspace(3)* %ptr, i16 %val) {
 ; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define void @atomic_store_monotonic_offset_i16(i16 addrspace(3)* %ptr, i16 %val) {
-  %gep = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i16 16
-  store atomic i16 %val, i16 addrspace(3)* %gep monotonic, align 2
+define void @atomic_store_monotonic_offset_i16(ptr addrspace(3) %ptr, i16 %val) {
+  %gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16
+  store atomic i16 %val, ptr addrspace(3) %gep monotonic, align 2
   ret void
 }
 
@@ -58,8 +58,8 @@ define void @atomic_store_monotonic_offset_i16(i16 addrspace(3)* %ptr, i16 %val)
 ; GCN-NEXT: ds_write_b32 v0, v1{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define void @atomic_store_monotonic_i32(i32 addrspace(3)* %ptr, i32 %val) {
-  store atomic i32 %val, i32 addrspace(3)* %ptr monotonic, align 4
+define void @atomic_store_monotonic_i32(ptr addrspace(3) %ptr, i32 %val) {
+  store atomic i32 %val, ptr addrspace(3) %ptr monotonic, align 4
   ret void
 }
 
@@ -70,9 +70,9 @@ define void @atomic_store_monotonic_i32(i32 addrspace(3)* %ptr, i32 %val) {
 ; GCN-NEXT: ds_write_b32 v0, v1 offset:64{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define void @atomic_store_monotonic_offset_i32(i32 addrspace(3)* %ptr, i32 %val) {
-  %gep = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 16
-  store atomic i32 %val, i32 addrspace(3)* %gep monotonic, align 4
+define void @atomic_store_monotonic_offset_i32(ptr addrspace(3) %ptr, i32 %val) {
+  %gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16
+  store atomic i32 %val, ptr addrspace(3) %gep monotonic, align 4
   ret void
 }
 
@@ -83,8 +83,8 @@ define void @atomic_store_monotonic_offset_i32(i32 addrspace(3)* %ptr, i32 %val)
 ; GCN-NEXT: ds_write_b64 v0, v[1:2]{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define void @atomic_store_monotonic_i64(i64 addrspace(3)* %ptr, i64 %val) {
-  store atomic i64 %val, i64 addrspace(3)* %ptr monotonic, align 8
+define void @atomic_store_monotonic_i64(ptr addrspace(3) %ptr, i64 %val) {
+  store atomic i64 %val, ptr addrspace(3) %ptr monotonic, align 8
   ret void
 }
 
@@ -95,9 +95,9 @@ define void @atomic_store_monotonic_i64(i64 addrspace(3)* %ptr, i64 %val) {
 ; GCN-NEXT: ds_write_b64 v0, v[1:2] offset:128{{$}}
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64
-define void @atomic_store_monotonic_offset_i64(i64 addrspace(3)* %ptr, i64 %val) {
-  %gep = getelementptr inbounds i64, i64 addrspace(3)* %ptr, i64 16
-  store atomic i64 %val, i64 addrspace(3)* %gep monotonic, align 8
+define void @atomic_store_monotonic_offset_i64(ptr addrspace(3) %ptr, i64 %val) {
+  %gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i64 16
+  store atomic i64 %val, ptr addrspace(3) %gep monotonic, align 8
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
index 2706037cbe444..5e1f9b0a8df92 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-define i32 @atomic_nand_i32_lds(i32 addrspace(3)* %ptr) nounwind {
+define i32 @atomic_nand_i32_lds(ptr addrspace(3) %ptr) nounwind {
 ; GCN-LABEL: atomic_nand_i32_lds:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24,11 +24,11 @@ define i32 @atomic_nand_i32_lds(i32 addrspace(3)* %ptr) nounwind {
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
+  %result = atomicrmw nand ptr addrspace(3) %ptr, i32 4 seq_cst
   ret i32 %result
 }
 
-define i32 @atomic_nand_i32_global(i32 addrspace(1)* %ptr) nounwind {
+define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
 ; GCN-LABEL: atomic_nand_i32_global:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52,11 +52,11 @@ define i32 @atomic_nand_i32_global(i32 addrspace(1)* %ptr) nounwind {
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw nand i32 addrspace(1)* %ptr, i32 4 seq_cst
+  %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst
   ret i32 %result
 }
 
-define i32 @atomic_nand_i32_flat(i32* %ptr) nounwind {
+define i32 @atomic_nand_i32_flat(ptr %ptr) nounwind {
 ; GCN-LABEL: atomic_nand_i32_flat:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -80,6 +80,6 @@ define i32 @atomic_nand_i32_flat(i32* %ptr) nounwind {
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw nand i32* %ptr, i32 4 seq_cst
+  %result = atomicrmw nand ptr %ptr, i32 4 seq_cst
   ret i32 %result
 }

diff --git a/llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll
index 240963cfe9009..2f7d1e9a6efaf 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll
@@ -15,89 +15,89 @@
 ; GFX90A-CAS-LABEL: atomic_add_cas:
 ; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
 ; GFX90A-CAS: s_cbranch_execnz
-define dso_local void @atomic_add_cas(float* %p, float %q) {
+define dso_local void @atomic_add_cas(ptr %p, float %q) {
 entry:
-  %ret = atomicrmw fadd float* %p, float %q monotonic, align 4
+  %ret = atomicrmw fadd ptr %p, float %q monotonic, align 4
   ret void
 }
 
 ; GFX90A-CAS-LABEL: atomic_add_cas_agent:
 ; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
 ; GFX90A-CAS: s_cbranch_execnz
-define dso_local void @atomic_add_cas_agent(float* %p, float %q) {
+define dso_local void @atomic_add_cas_agent(ptr %p, float %q) {
 entry:
-  %ret = atomicrmw fadd float* %p, float %q syncscope("agent") monotonic, align 4
+  %ret = atomicrmw fadd ptr %p, float %q syncscope("agent") monotonic, align 4
   ret void
 }
 
 ; GFX90A-CAS-LABEL: atomic_add_cas_workgroup:
 ; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
 ; GFX90A-CAS: s_cbranch_execnz
-define dso_local void @atomic_add_cas_workgroup(float* %p, float %q) {
+define dso_local void @atomic_add_cas_workgroup(ptr %p, float %q) {
 entry:
-  %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup") monotonic, align 4
+  %ret = atomicrmw fadd ptr %p, float %q syncscope("workgroup") monotonic, align 4
   ret void
 }
 
 ; GFX90A-CAS-LABEL: atomic_add_cas_wavefront:
 ; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
 ; GFX90A-CAS: s_cbranch_execnz
-define dso_local void @atomic_add_cas_wavefront(float* %p, float %q) {
+define dso_local void @atomic_add_cas_wavefront(ptr %p, float %q) {
 entry:
-  %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront") monotonic, align 4
+  %ret = atomicrmw fadd ptr %p, float %q syncscope("wavefront") monotonic, align 4
   ret void
 }
 
 ; GFX90A-CAS-LABEL: atomic_add_cas_singlethread:
 ; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
 ; GFX90A-CAS: s_cbranch_execnz
-define dso_local void @atomic_add_cas_singlethread(float* %p, float %q) {
+define dso_local void @atomic_add_cas_singlethread(ptr %p, float %q) {
 entry:
-  %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread") monotonic, align 4
+  %ret = atomicrmw fadd ptr %p, float %q syncscope("singlethread") monotonic, align 4
   ret void
 }
 
 ; GFX90A-CAS-LABEL: atomic_add_cas_one_as:
 ; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
 ; GFX90A-CAS: s_cbranch_execnz
-define dso_local void @atomic_add_cas_one_as(float* %p, float %q) {
+define dso_local void @atomic_add_cas_one_as(ptr %p, float %q) {
 entry:
-  %ret = atomicrmw fadd float* %p, float %q syncscope("one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr %p, float %q syncscope("one-as") monotonic, align 4
   ret void
 }
 
 ; GFX90A-CAS-LABEL: atomic_add_cas_agent_one_as:
 ; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
 ; GFX90A-CAS: s_cbranch_execnz
-define dso_local void @atomic_add_cas_agent_one_as(float* %p, float %q) {
+define dso_local void @atomic_add_cas_agent_one_as(ptr %p, float %q) {
 entry:
-  %ret = atomicrmw fadd float* %p, float %q syncscope("agent-one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr %p, float %q syncscope("agent-one-as") monotonic, align 4
   ret void
 }
 
 ; GFX90A-CAS-LABEL: atomic_add_cas_workgroup_one_as:
 ; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
 ; GFX90A-CAS: s_cbranch_execnz
-define dso_local void @atomic_add_cas_workgroup_one_as(float* %p, float %q) {
+define dso_local void @atomic_add_cas_workgroup_one_as(ptr %p, float %q) {
 entry:
-  %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup-one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr %p, float %q syncscope("workgroup-one-as") monotonic, align 4
   ret void
 }
 
 ; GFX90A-CAS-LABEL: atomic_add_cas_wavefront_one_as:
 ; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
 ; GFX90A-CAS: s_cbranch_execnz
-define dso_local void @atomic_add_cas_wavefront_one_as(float* %p, float %q) {
+define dso_local void @atomic_add_cas_wavefront_one_as(ptr %p, float %q) {
 entry:
-  %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront-one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr %p, float %q syncscope("wavefront-one-as") monotonic, align 4
   ret void
 }
 
 ; GFX90A-CAS-LABEL: atomic_add_cas_singlethread_one_as:
 ; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
 ; GFX90A-CAS: s_cbranch_execnz
-define dso_local void @atomic_add_cas_singlethread_one_as(float* %p, float %q) {
+define dso_local void @atomic_add_cas_singlethread_one_as(ptr %p, float %q) {
 entry:
-  %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread-one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr %p, float %q syncscope("singlethread-one-as") monotonic, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll
index 35637cd9882a1..f5eb9f23d6cbf 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll
@@ -14,81 +14,81 @@
 ; GFX90A-HW-LABEL: atomic_add_unsafe_hw:
 ; GFX90A-HW:    ds_add_f64 v2, v[0:1]
 ; GFX90A-HW:    s_endpgm
-define amdgpu_kernel void @atomic_add_unsafe_hw(double addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @atomic_add_unsafe_hw(ptr addrspace(3) %ptr) #0 {
 main_body:
-  %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+  %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
   ret void
 }
 
 ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_agent:
 ; GFX90A-HW:    global_atomic_add_f32 v0, v1, s[2:3]
 ; GFX90A-HW:    s_endpgm
-define amdgpu_kernel void @atomic_add_unsafe_hw_agent(float addrspace(1)* %ptr, float %val) #0 {
+define amdgpu_kernel void @atomic_add_unsafe_hw_agent(ptr addrspace(1) %ptr, float %val) #0 {
 main_body:
-  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("agent") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret void
 }
 
 ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wg:
 ; GFX90A-HW:    global_atomic_add_f32 v0, v1, s[2:3]
 ; GFX90A-HW:    s_endpgm
-define amdgpu_kernel void @atomic_add_unsafe_hw_wg(float addrspace(1)* %ptr, float %val) #0 {
+define amdgpu_kernel void @atomic_add_unsafe_hw_wg(ptr addrspace(1) %ptr, float %val) #0 {
 main_body:
-  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("workgroup") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("workgroup") monotonic, align 4
   ret void
 }
 
 ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wavefront:
 ; GFX90A-HW:    global_atomic_add_f32 v0, v1, s[2:3]
 ; GFX90A-HW:    s_endpgm
-define amdgpu_kernel void @atomic_add_unsafe_hw_wavefront(float addrspace(1)* %ptr, float %val) #0 {
+define amdgpu_kernel void @atomic_add_unsafe_hw_wavefront(ptr addrspace(1) %ptr, float %val) #0 {
 main_body:
-  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("wavefront") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("wavefront") monotonic, align 4
   ret void
 }
 
 ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_single_thread:
 ; GFX90A-HW:    global_atomic_add_f32 v0, v1, s[2:3]
 ; GFX90A-HW:    s_endpgm
-define amdgpu_kernel void @atomic_add_unsafe_hw_single_thread(float addrspace(1)* %ptr, float %val) #0 {
+define amdgpu_kernel void @atomic_add_unsafe_hw_single_thread(ptr addrspace(1) %ptr, float %val) #0 {
 main_body:
-  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("singlethread") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("singlethread") monotonic, align 4
   ret void
 }
 
 ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_aoa:
 ; GFX90A-HW:    global_atomic_add_f32 v0, v1, s[2:3]
 ; GFX90A-HW:    s_endpgm
-define amdgpu_kernel void @atomic_add_unsafe_hw_aoa(float addrspace(1)* %ptr, float %val) #0 {
+define amdgpu_kernel void @atomic_add_unsafe_hw_aoa(ptr addrspace(1) %ptr, float %val) #0 {
 main_body:
-  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("agent-one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent-one-as") monotonic, align 4
   ret void
 }
 
 ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wgoa:
 ; GFX90A-HW:    global_atomic_add_f32 v0, v1, s[2:3]
 ; GFX90A-HW:    s_endpgm
-define amdgpu_kernel void @atomic_add_unsafe_hw_wgoa(float addrspace(1)* %ptr, float %val) #0 {
+define amdgpu_kernel void @atomic_add_unsafe_hw_wgoa(ptr addrspace(1) %ptr, float %val) #0 {
 main_body:
-  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("workgroup-one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("workgroup-one-as") monotonic, align 4
   ret void
 }
 
 ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wfoa:
 ; GFX90A-HW:    global_atomic_add_f32 v0, v1, s[2:3]
 ; GFX90A-HW:    s_endpgm
-define amdgpu_kernel void @atomic_add_unsafe_hw_wfoa(float addrspace(1)* %ptr, float %val) #0 {
+define amdgpu_kernel void @atomic_add_unsafe_hw_wfoa(ptr addrspace(1) %ptr, float %val) #0 {
 main_body:
-  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("wavefront-one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("wavefront-one-as") monotonic, align 4
   ret void
 }
 
 ; GFX90A-HW-LABEL: atomic_add_unsafe_hw_stoa:
 ; GFX90A-HW:    global_atomic_add_f32 v0, v1, s[2:3]
 ; GFX90A-HW:    s_endpgm
-define amdgpu_kernel void @atomic_add_unsafe_hw_stoa(float addrspace(1)* %ptr, float %val) #0 {
+define amdgpu_kernel void @atomic_add_unsafe_hw_stoa(ptr addrspace(1) %ptr, float %val) #0 {
 main_body:
-  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("singlethread-one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("singlethread-one-as") monotonic, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
index 670644e38d4f3..9d2be29f96ebf 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
@@ -41,89 +41,89 @@ attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
 ; CHECK: NumVGPRsForWavesPerEU: 43
 @var = addrspace(1) global float 0.0
 define amdgpu_kernel void @min_1024_max_1024() #3 {
-  %val0 = load volatile float, float addrspace(1)* @var
-  %val1 = load volatile float, float addrspace(1)* @var
-  %val2 = load volatile float, float addrspace(1)* @var
-  %val3 = load volatile float, float addrspace(1)* @var
-  %val4 = load volatile float, float addrspace(1)* @var
-  %val5 = load volatile float, float addrspace(1)* @var
-  %val6 = load volatile float, float addrspace(1)* @var
-  %val7 = load volatile float, float addrspace(1)* @var
-  %val8 = load volatile float, float addrspace(1)* @var
-  %val9 = load volatile float, float addrspace(1)* @var
-  %val10 = load volatile float, float addrspace(1)* @var
-  %val11 = load volatile float, float addrspace(1)* @var
-  %val12 = load volatile float, float addrspace(1)* @var
-  %val13 = load volatile float, float addrspace(1)* @var
-  %val14 = load volatile float, float addrspace(1)* @var
-  %val15 = load volatile float, float addrspace(1)* @var
-  %val16 = load volatile float, float addrspace(1)* @var
-  %val17 = load volatile float, float addrspace(1)* @var
-  %val18 = load volatile float, float addrspace(1)* @var
-  %val19 = load volatile float, float addrspace(1)* @var
-  %val20 = load volatile float, float addrspace(1)* @var
-  %val21 = load volatile float, float addrspace(1)* @var
-  %val22 = load volatile float, float addrspace(1)* @var
-  %val23 = load volatile float, float addrspace(1)* @var
-  %val24 = load volatile float, float addrspace(1)* @var
-  %val25 = load volatile float, float addrspace(1)* @var
-  %val26 = load volatile float, float addrspace(1)* @var
-  %val27 = load volatile float, float addrspace(1)* @var
-  %val28 = load volatile float, float addrspace(1)* @var
-  %val29 = load volatile float, float addrspace(1)* @var
-  %val30 = load volatile float, float addrspace(1)* @var
-  %val31 = load volatile float, float addrspace(1)* @var
-  %val32 = load volatile float, float addrspace(1)* @var
-  %val33 = load volatile float, float addrspace(1)* @var
-  %val34 = load volatile float, float addrspace(1)* @var
-  %val35 = load volatile float, float addrspace(1)* @var
-  %val36 = load volatile float, float addrspace(1)* @var
-  %val37 = load volatile float, float addrspace(1)* @var
-  %val38 = load volatile float, float addrspace(1)* @var
-  %val39 = load volatile float, float addrspace(1)* @var
-  %val40 = load volatile float, float addrspace(1)* @var
+  %val0 = load volatile float, ptr addrspace(1) @var
+  %val1 = load volatile float, ptr addrspace(1) @var
+  %val2 = load volatile float, ptr addrspace(1) @var
+  %val3 = load volatile float, ptr addrspace(1) @var
+  %val4 = load volatile float, ptr addrspace(1) @var
+  %val5 = load volatile float, ptr addrspace(1) @var
+  %val6 = load volatile float, ptr addrspace(1) @var
+  %val7 = load volatile float, ptr addrspace(1) @var
+  %val8 = load volatile float, ptr addrspace(1) @var
+  %val9 = load volatile float, ptr addrspace(1) @var
+  %val10 = load volatile float, ptr addrspace(1) @var
+  %val11 = load volatile float, ptr addrspace(1) @var
+  %val12 = load volatile float, ptr addrspace(1) @var
+  %val13 = load volatile float, ptr addrspace(1) @var
+  %val14 = load volatile float, ptr addrspace(1) @var
+  %val15 = load volatile float, ptr addrspace(1) @var
+  %val16 = load volatile float, ptr addrspace(1) @var
+  %val17 = load volatile float, ptr addrspace(1) @var
+  %val18 = load volatile float, ptr addrspace(1) @var
+  %val19 = load volatile float, ptr addrspace(1) @var
+  %val20 = load volatile float, ptr addrspace(1) @var
+  %val21 = load volatile float, ptr addrspace(1) @var
+  %val22 = load volatile float, ptr addrspace(1) @var
+  %val23 = load volatile float, ptr addrspace(1) @var
+  %val24 = load volatile float, ptr addrspace(1) @var
+  %val25 = load volatile float, ptr addrspace(1) @var
+  %val26 = load volatile float, ptr addrspace(1) @var
+  %val27 = load volatile float, ptr addrspace(1) @var
+  %val28 = load volatile float, ptr addrspace(1) @var
+  %val29 = load volatile float, ptr addrspace(1) @var
+  %val30 = load volatile float, ptr addrspace(1) @var
+  %val31 = load volatile float, ptr addrspace(1) @var
+  %val32 = load volatile float, ptr addrspace(1) @var
+  %val33 = load volatile float, ptr addrspace(1) @var
+  %val34 = load volatile float, ptr addrspace(1) @var
+  %val35 = load volatile float, ptr addrspace(1) @var
+  %val36 = load volatile float, ptr addrspace(1) @var
+  %val37 = load volatile float, ptr addrspace(1) @var
+  %val38 = load volatile float, ptr addrspace(1) @var
+  %val39 = load volatile float, ptr addrspace(1) @var
+  %val40 = load volatile float, ptr addrspace(1) @var
 
-  store volatile float %val0, float addrspace(1)* @var
-  store volatile float %val1, float addrspace(1)* @var
-  store volatile float %val2, float addrspace(1)* @var
-  store volatile float %val3, float addrspace(1)* @var
-  store volatile float %val4, float addrspace(1)* @var
-  store volatile float %val5, float addrspace(1)* @var
-  store volatile float %val6, float addrspace(1)* @var
-  store volatile float %val7, float addrspace(1)* @var
-  store volatile float %val8, float addrspace(1)* @var
-  store volatile float %val9, float addrspace(1)* @var
-  store volatile float %val10, float addrspace(1)* @var
-  store volatile float %val11, float addrspace(1)* @var
-  store volatile float %val12, float addrspace(1)* @var
-  store volatile float %val13, float addrspace(1)* @var
-  store volatile float %val14, float addrspace(1)* @var
-  store volatile float %val15, float addrspace(1)* @var
-  store volatile float %val16, float addrspace(1)* @var
-  store volatile float %val17, float addrspace(1)* @var
-  store volatile float %val18, float addrspace(1)* @var
-  store volatile float %val19, float addrspace(1)* @var
-  store volatile float %val20, float addrspace(1)* @var
-  store volatile float %val21, float addrspace(1)* @var
-  store volatile float %val22, float addrspace(1)* @var
-  store volatile float %val23, float addrspace(1)* @var
-  store volatile float %val24, float addrspace(1)* @var
-  store volatile float %val25, float addrspace(1)* @var
-  store volatile float %val26, float addrspace(1)* @var
-  store volatile float %val27, float addrspace(1)* @var
-  store volatile float %val28, float addrspace(1)* @var
-  store volatile float %val29, float addrspace(1)* @var
-  store volatile float %val30, float addrspace(1)* @var
-  store volatile float %val31, float addrspace(1)* @var
-  store volatile float %val32, float addrspace(1)* @var
-  store volatile float %val33, float addrspace(1)* @var
-  store volatile float %val34, float addrspace(1)* @var
-  store volatile float %val35, float addrspace(1)* @var
-  store volatile float %val36, float addrspace(1)* @var
-  store volatile float %val37, float addrspace(1)* @var
-  store volatile float %val38, float addrspace(1)* @var
-  store volatile float %val39, float addrspace(1)* @var
-  store volatile float %val40, float addrspace(1)* @var
+  store volatile float %val0, ptr addrspace(1) @var
+  store volatile float %val1, ptr addrspace(1) @var
+  store volatile float %val2, ptr addrspace(1) @var
+  store volatile float %val3, ptr addrspace(1) @var
+  store volatile float %val4, ptr addrspace(1) @var
+  store volatile float %val5, ptr addrspace(1) @var
+  store volatile float %val6, ptr addrspace(1) @var
+  store volatile float %val7, ptr addrspace(1) @var
+  store volatile float %val8, ptr addrspace(1) @var
+  store volatile float %val9, ptr addrspace(1) @var
+  store volatile float %val10, ptr addrspace(1) @var
+  store volatile float %val11, ptr addrspace(1) @var
+  store volatile float %val12, ptr addrspace(1) @var
+  store volatile float %val13, ptr addrspace(1) @var
+  store volatile float %val14, ptr addrspace(1) @var
+  store volatile float %val15, ptr addrspace(1) @var
+  store volatile float %val16, ptr addrspace(1) @var
+  store volatile float %val17, ptr addrspace(1) @var
+  store volatile float %val18, ptr addrspace(1) @var
+  store volatile float %val19, ptr addrspace(1) @var
+  store volatile float %val20, ptr addrspace(1) @var
+  store volatile float %val21, ptr addrspace(1) @var
+  store volatile float %val22, ptr addrspace(1) @var
+  store volatile float %val23, ptr addrspace(1) @var
+  store volatile float %val24, ptr addrspace(1) @var
+  store volatile float %val25, ptr addrspace(1) @var
+  store volatile float %val26, ptr addrspace(1) @var
+  store volatile float %val27, ptr addrspace(1) @var
+  store volatile float %val28, ptr addrspace(1) @var
+  store volatile float %val29, ptr addrspace(1) @var
+  store volatile float %val30, ptr addrspace(1) @var
+  store volatile float %val31, ptr addrspace(1) @var
+  store volatile float %val32, ptr addrspace(1) @var
+  store volatile float %val33, ptr addrspace(1) @var
+  store volatile float %val34, ptr addrspace(1) @var
+  store volatile float %val35, ptr addrspace(1) @var
+  store volatile float %val36, ptr addrspace(1) @var
+  store volatile float %val37, ptr addrspace(1) @var
+  store volatile float %val38, ptr addrspace(1) @var
+  store volatile float %val39, ptr addrspace(1) @var
+  store volatile float %val40, ptr addrspace(1) @var
 
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 48945eaf3fd21..ffab35995dd17 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -41,89 +41,89 @@ attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
 ; CHECK: NumVGPRsForWavesPerEU: 43
 @var = addrspace(1) global float 0.0
 define amdgpu_kernel void @min_1024_max_1024() #3 {
-  %val0 = load volatile float, float addrspace(1)* @var
-  %val1 = load volatile float, float addrspace(1)* @var
-  %val2 = load volatile float, float addrspace(1)* @var
-  %val3 = load volatile float, float addrspace(1)* @var
-  %val4 = load volatile float, float addrspace(1)* @var
-  %val5 = load volatile float, float addrspace(1)* @var
-  %val6 = load volatile float, float addrspace(1)* @var
-  %val7 = load volatile float, float addrspace(1)* @var
-  %val8 = load volatile float, float addrspace(1)* @var
-  %val9 = load volatile float, float addrspace(1)* @var
-  %val10 = load volatile float, float addrspace(1)* @var
-  %val11 = load volatile float, float addrspace(1)* @var
-  %val12 = load volatile float, float addrspace(1)* @var
-  %val13 = load volatile float, float addrspace(1)* @var
-  %val14 = load volatile float, float addrspace(1)* @var
-  %val15 = load volatile float, float addrspace(1)* @var
-  %val16 = load volatile float, float addrspace(1)* @var
-  %val17 = load volatile float, float addrspace(1)* @var
-  %val18 = load volatile float, float addrspace(1)* @var
-  %val19 = load volatile float, float addrspace(1)* @var
-  %val20 = load volatile float, float addrspace(1)* @var
-  %val21 = load volatile float, float addrspace(1)* @var
-  %val22 = load volatile float, float addrspace(1)* @var
-  %val23 = load volatile float, float addrspace(1)* @var
-  %val24 = load volatile float, float addrspace(1)* @var
-  %val25 = load volatile float, float addrspace(1)* @var
-  %val26 = load volatile float, float addrspace(1)* @var
-  %val27 = load volatile float, float addrspace(1)* @var
-  %val28 = load volatile float, float addrspace(1)* @var
-  %val29 = load volatile float, float addrspace(1)* @var
-  %val30 = load volatile float, float addrspace(1)* @var
-  %val31 = load volatile float, float addrspace(1)* @var
-  %val32 = load volatile float, float addrspace(1)* @var
-  %val33 = load volatile float, float addrspace(1)* @var
-  %val34 = load volatile float, float addrspace(1)* @var
-  %val35 = load volatile float, float addrspace(1)* @var
-  %val36 = load volatile float, float addrspace(1)* @var
-  %val37 = load volatile float, float addrspace(1)* @var
-  %val38 = load volatile float, float addrspace(1)* @var
-  %val39 = load volatile float, float addrspace(1)* @var
-  %val40 = load volatile float, float addrspace(1)* @var
+  %val0 = load volatile float, ptr addrspace(1) @var
+  %val1 = load volatile float, ptr addrspace(1) @var
+  %val2 = load volatile float, ptr addrspace(1) @var
+  %val3 = load volatile float, ptr addrspace(1) @var
+  %val4 = load volatile float, ptr addrspace(1) @var
+  %val5 = load volatile float, ptr addrspace(1) @var
+  %val6 = load volatile float, ptr addrspace(1) @var
+  %val7 = load volatile float, ptr addrspace(1) @var
+  %val8 = load volatile float, ptr addrspace(1) @var
+  %val9 = load volatile float, ptr addrspace(1) @var
+  %val10 = load volatile float, ptr addrspace(1) @var
+  %val11 = load volatile float, ptr addrspace(1) @var
+  %val12 = load volatile float, ptr addrspace(1) @var
+  %val13 = load volatile float, ptr addrspace(1) @var
+  %val14 = load volatile float, ptr addrspace(1) @var
+  %val15 = load volatile float, ptr addrspace(1) @var
+  %val16 = load volatile float, ptr addrspace(1) @var
+  %val17 = load volatile float, ptr addrspace(1) @var
+  %val18 = load volatile float, ptr addrspace(1) @var
+  %val19 = load volatile float, ptr addrspace(1) @var
+  %val20 = load volatile float, ptr addrspace(1) @var
+  %val21 = load volatile float, ptr addrspace(1) @var
+  %val22 = load volatile float, ptr addrspace(1) @var
+  %val23 = load volatile float, ptr addrspace(1) @var
+  %val24 = load volatile float, ptr addrspace(1) @var
+  %val25 = load volatile float, ptr addrspace(1) @var
+  %val26 = load volatile float, ptr addrspace(1) @var
+  %val27 = load volatile float, ptr addrspace(1) @var
+  %val28 = load volatile float, ptr addrspace(1) @var
+  %val29 = load volatile float, ptr addrspace(1) @var
+  %val30 = load volatile float, ptr addrspace(1) @var
+  %val31 = load volatile float, ptr addrspace(1) @var
+  %val32 = load volatile float, ptr addrspace(1) @var
+  %val33 = load volatile float, ptr addrspace(1) @var
+  %val34 = load volatile float, ptr addrspace(1) @var
+  %val35 = load volatile float, ptr addrspace(1) @var
+  %val36 = load volatile float, ptr addrspace(1) @var
+  %val37 = load volatile float, ptr addrspace(1) @var
+  %val38 = load volatile float, ptr addrspace(1) @var
+  %val39 = load volatile float, ptr addrspace(1) @var
+  %val40 = load volatile float, ptr addrspace(1) @var
 
-  store volatile float %val0, float addrspace(1)* @var
-  store volatile float %val1, float addrspace(1)* @var
-  store volatile float %val2, float addrspace(1)* @var
-  store volatile float %val3, float addrspace(1)* @var
-  store volatile float %val4, float addrspace(1)* @var
-  store volatile float %val5, float addrspace(1)* @var
-  store volatile float %val6, float addrspace(1)* @var
-  store volatile float %val7, float addrspace(1)* @var
-  store volatile float %val8, float addrspace(1)* @var
-  store volatile float %val9, float addrspace(1)* @var
-  store volatile float %val10, float addrspace(1)* @var
-  store volatile float %val11, float addrspace(1)* @var
-  store volatile float %val12, float addrspace(1)* @var
-  store volatile float %val13, float addrspace(1)* @var
-  store volatile float %val14, float addrspace(1)* @var
-  store volatile float %val15, float addrspace(1)* @var
-  store volatile float %val16, float addrspace(1)* @var
-  store volatile float %val17, float addrspace(1)* @var
-  store volatile float %val18, float addrspace(1)* @var
-  store volatile float %val19, float addrspace(1)* @var
-  store volatile float %val20, float addrspace(1)* @var
-  store volatile float %val21, float addrspace(1)* @var
-  store volatile float %val22, float addrspace(1)* @var
-  store volatile float %val23, float addrspace(1)* @var
-  store volatile float %val24, float addrspace(1)* @var
-  store volatile float %val25, float addrspace(1)* @var
-  store volatile float %val26, float addrspace(1)* @var
-  store volatile float %val27, float addrspace(1)* @var
-  store volatile float %val28, float addrspace(1)* @var
-  store volatile float %val29, float addrspace(1)* @var
-  store volatile float %val30, float addrspace(1)* @var
-  store volatile float %val31, float addrspace(1)* @var
-  store volatile float %val32, float addrspace(1)* @var
-  store volatile float %val33, float addrspace(1)* @var
-  store volatile float %val34, float addrspace(1)* @var
-  store volatile float %val35, float addrspace(1)* @var
-  store volatile float %val36, float addrspace(1)* @var
-  store volatile float %val37, float addrspace(1)* @var
-  store volatile float %val38, float addrspace(1)* @var
-  store volatile float %val39, float addrspace(1)* @var
-  store volatile float %val40, float addrspace(1)* @var
+  store volatile float %val0, ptr addrspace(1) @var
+  store volatile float %val1, ptr addrspace(1) @var
+  store volatile float %val2, ptr addrspace(1) @var
+  store volatile float %val3, ptr addrspace(1) @var
+  store volatile float %val4, ptr addrspace(1) @var
+  store volatile float %val5, ptr addrspace(1) @var
+  store volatile float %val6, ptr addrspace(1) @var
+  store volatile float %val7, ptr addrspace(1) @var
+  store volatile float %val8, ptr addrspace(1) @var
+  store volatile float %val9, ptr addrspace(1) @var
+  store volatile float %val10, ptr addrspace(1) @var
+  store volatile float %val11, ptr addrspace(1) @var
+  store volatile float %val12, ptr addrspace(1) @var
+  store volatile float %val13, ptr addrspace(1) @var
+  store volatile float %val14, ptr addrspace(1) @var
+  store volatile float %val15, ptr addrspace(1) @var
+  store volatile float %val16, ptr addrspace(1) @var
+  store volatile float %val17, ptr addrspace(1) @var
+  store volatile float %val18, ptr addrspace(1) @var
+  store volatile float %val19, ptr addrspace(1) @var
+  store volatile float %val20, ptr addrspace(1) @var
+  store volatile float %val21, ptr addrspace(1) @var
+  store volatile float %val22, ptr addrspace(1) @var
+  store volatile float %val23, ptr addrspace(1) @var
+  store volatile float %val24, ptr addrspace(1) @var
+  store volatile float %val25, ptr addrspace(1) @var
+  store volatile float %val26, ptr addrspace(1) @var
+  store volatile float %val27, ptr addrspace(1) @var
+  store volatile float %val28, ptr addrspace(1) @var
+  store volatile float %val29, ptr addrspace(1) @var
+  store volatile float %val30, ptr addrspace(1) @var
+  store volatile float %val31, ptr addrspace(1) @var
+  store volatile float %val32, ptr addrspace(1) @var
+  store volatile float %val33, ptr addrspace(1) @var
+  store volatile float %val34, ptr addrspace(1) @var
+  store volatile float %val35, ptr addrspace(1) @var
+  store volatile float %val36, ptr addrspace(1) @var
+  store volatile float %val37, ptr addrspace(1) @var
+  store volatile float %val38, ptr addrspace(1) @var
+  store volatile float %val39, ptr addrspace(1) @var
+  store volatile float %val40, ptr addrspace(1) @var
 
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
index 979665ff0a807..a1594a83d7dd8 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
@@ -6,69 +6,69 @@
 ; CHECK: VGPRBlocks: 4
 ; CHECK: NumVGPRsForWavesPerEU: 20
 define amdgpu_kernel void @max_20_vgprs() #1 {
-  %val0 = load volatile float, float addrspace(1)* @var
-  %val1 = load volatile float, float addrspace(1)* @var
-  %val2 = load volatile float, float addrspace(1)* @var
-  %val3 = load volatile float, float addrspace(1)* @var
-  %val4 = load volatile float, float addrspace(1)* @var
-  %val5 = load volatile float, float addrspace(1)* @var
-  %val6 = load volatile float, float addrspace(1)* @var
-  %val7 = load volatile float, float addrspace(1)* @var
-  %val8 = load volatile float, float addrspace(1)* @var
-  %val9 = load volatile float, float addrspace(1)* @var
-  %val10 = load volatile float, float addrspace(1)* @var
-  %val11 = load volatile float, float addrspace(1)* @var
-  %val12 = load volatile float, float addrspace(1)* @var
-  %val13 = load volatile float, float addrspace(1)* @var
-  %val14 = load volatile float, float addrspace(1)* @var
-  %val15 = load volatile float, float addrspace(1)* @var
-  %val16 = load volatile float, float addrspace(1)* @var
-  %val17 = load volatile float, float addrspace(1)* @var
-  %val18 = load volatile float, float addrspace(1)* @var
-  %val19 = load volatile float, float addrspace(1)* @var
-  %val20 = load volatile float, float addrspace(1)* @var
-  %val21 = load volatile float, float addrspace(1)* @var
-  %val22 = load volatile float, float addrspace(1)* @var
-  %val23 = load volatile float, float addrspace(1)* @var
-  %val24 = load volatile float, float addrspace(1)* @var
-  %val25 = load volatile float, float addrspace(1)* @var
-  %val26 = load volatile float, float addrspace(1)* @var
-  %val27 = load volatile float, float addrspace(1)* @var
-  %val28 = load volatile float, float addrspace(1)* @var
-  %val29 = load volatile float, float addrspace(1)* @var
-  %val30 = load volatile float, float addrspace(1)* @var
+  %val0 = load volatile float, ptr addrspace(1) @var
+  %val1 = load volatile float, ptr addrspace(1) @var
+  %val2 = load volatile float, ptr addrspace(1) @var
+  %val3 = load volatile float, ptr addrspace(1) @var
+  %val4 = load volatile float, ptr addrspace(1) @var
+  %val5 = load volatile float, ptr addrspace(1) @var
+  %val6 = load volatile float, ptr addrspace(1) @var
+  %val7 = load volatile float, ptr addrspace(1) @var
+  %val8 = load volatile float, ptr addrspace(1) @var
+  %val9 = load volatile float, ptr addrspace(1) @var
+  %val10 = load volatile float, ptr addrspace(1) @var
+  %val11 = load volatile float, ptr addrspace(1) @var
+  %val12 = load volatile float, ptr addrspace(1) @var
+  %val13 = load volatile float, ptr addrspace(1) @var
+  %val14 = load volatile float, ptr addrspace(1) @var
+  %val15 = load volatile float, ptr addrspace(1) @var
+  %val16 = load volatile float, ptr addrspace(1) @var
+  %val17 = load volatile float, ptr addrspace(1) @var
+  %val18 = load volatile float, ptr addrspace(1) @var
+  %val19 = load volatile float, ptr addrspace(1) @var
+  %val20 = load volatile float, ptr addrspace(1) @var
+  %val21 = load volatile float, ptr addrspace(1) @var
+  %val22 = load volatile float, ptr addrspace(1) @var
+  %val23 = load volatile float, ptr addrspace(1) @var
+  %val24 = load volatile float, ptr addrspace(1) @var
+  %val25 = load volatile float, ptr addrspace(1) @var
+  %val26 = load volatile float, ptr addrspace(1) @var
+  %val27 = load volatile float, ptr addrspace(1) @var
+  %val28 = load volatile float, ptr addrspace(1) @var
+  %val29 = load volatile float, ptr addrspace(1) @var
+  %val30 = load volatile float, ptr addrspace(1) @var
 
-  store volatile float %val0, float addrspace(1)* @var
-  store volatile float %val1, float addrspace(1)* @var
-  store volatile float %val2, float addrspace(1)* @var
-  store volatile float %val3, float addrspace(1)* @var
-  store volatile float %val4, float addrspace(1)* @var
-  store volatile float %val5, float addrspace(1)* @var
-  store volatile float %val6, float addrspace(1)* @var
-  store volatile float %val7, float addrspace(1)* @var
-  store volatile float %val8, float addrspace(1)* @var
-  store volatile float %val9, float addrspace(1)* @var
-  store volatile float %val10, float addrspace(1)* @var
-  store volatile float %val11, float addrspace(1)* @var
-  store volatile float %val12, float addrspace(1)* @var
-  store volatile float %val13, float addrspace(1)* @var
-  store volatile float %val14, float addrspace(1)* @var
-  store volatile float %val15, float addrspace(1)* @var
-  store volatile float %val16, float addrspace(1)* @var
-  store volatile float %val17, float addrspace(1)* @var
-  store volatile float %val18, float addrspace(1)* @var
-  store volatile float %val19, float addrspace(1)* @var
-  store volatile float %val20, float addrspace(1)* @var
-  store volatile float %val21, float addrspace(1)* @var
-  store volatile float %val22, float addrspace(1)* @var
-  store volatile float %val23, float addrspace(1)* @var
-  store volatile float %val24, float addrspace(1)* @var
-  store volatile float %val25, float addrspace(1)* @var
-  store volatile float %val26, float addrspace(1)* @var
-  store volatile float %val27, float addrspace(1)* @var
-  store volatile float %val28, float addrspace(1)* @var
-  store volatile float %val29, float addrspace(1)* @var
-  store volatile float %val30, float addrspace(1)* @var
+  store volatile float %val0, ptr addrspace(1) @var
+  store volatile float %val1, ptr addrspace(1) @var
+  store volatile float %val2, ptr addrspace(1) @var
+  store volatile float %val3, ptr addrspace(1) @var
+  store volatile float %val4, ptr addrspace(1) @var
+  store volatile float %val5, ptr addrspace(1) @var
+  store volatile float %val6, ptr addrspace(1) @var
+  store volatile float %val7, ptr addrspace(1) @var
+  store volatile float %val8, ptr addrspace(1) @var
+  store volatile float %val9, ptr addrspace(1) @var
+  store volatile float %val10, ptr addrspace(1) @var
+  store volatile float %val11, ptr addrspace(1) @var
+  store volatile float %val12, ptr addrspace(1) @var
+  store volatile float %val13, ptr addrspace(1) @var
+  store volatile float %val14, ptr addrspace(1) @var
+  store volatile float %val15, ptr addrspace(1) @var
+  store volatile float %val16, ptr addrspace(1) @var
+  store volatile float %val17, ptr addrspace(1) @var
+  store volatile float %val18, ptr addrspace(1) @var
+  store volatile float %val19, ptr addrspace(1) @var
+  store volatile float %val20, ptr addrspace(1) @var
+  store volatile float %val21, ptr addrspace(1) @var
+  store volatile float %val22, ptr addrspace(1) @var
+  store volatile float %val23, ptr addrspace(1) @var
+  store volatile float %val24, ptr addrspace(1) @var
+  store volatile float %val25, ptr addrspace(1) @var
+  store volatile float %val26, ptr addrspace(1) @var
+  store volatile float %val27, ptr addrspace(1) @var
+  store volatile float %val28, ptr addrspace(1) @var
+  store volatile float %val29, ptr addrspace(1) @var
+  store volatile float %val30, ptr addrspace(1) @var
 
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll b/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll
index 4105d24f0a6ea..dabdbbda2a884 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll
@@ -1,53 +1,51 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s
 
-%0 = type { double()*, %0* }
+%0 = type { ptr, ptr }
 
-define internal fastcc i1 @widget(%0* %arg) {
+define internal fastcc i1 @widget(ptr %arg) {
 ; CHECK-LABEL: define {{[^@]+}}@widget
-; CHECK-SAME: (%0* [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: (ptr [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[TMP0:%.*]], %0* [[ARG]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load %0*, %0** [[TMP]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = call fastcc double @baz(%0* [[TMP1]])
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[TMP0:%.*]], ptr [[ARG]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = call fastcc double @baz(ptr [[TMP1]])
 ; CHECK-NEXT:    ret i1 false
 ;
 bb:
-  %tmp = getelementptr inbounds %0, %0* %arg, i64 0, i32 1
-  %tmp1 = load %0*, %0** %tmp, align 8
-  %tmp2 = call fastcc double @baz(%0* %tmp1)
+  %tmp = getelementptr inbounds %0, ptr %arg, i64 0, i32 1
+  %tmp1 = load ptr, ptr %tmp, align 8
+  %tmp2 = call fastcc double @baz(ptr %tmp1)
   ret i1 false
 }
 
-define internal fastcc double @baz(%0* %arg) {
+define internal fastcc double @baz(ptr %arg) {
 ; CHECK-LABEL: define {{[^@]+}}@baz
-; CHECK-SAME: (%0* [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[TMP0:%.*]], %0* [[ARG]], i64 0, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load double ()*, double ()** [[TMP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ARG]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call double [[TMP1]]()
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[TMP0]], %0* [[ARG]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[TMP0]], ptr [[ARG]], i64 0, i32 1
 ; CHECK-NEXT:    br label [[BB5:%.*]]
 ; CHECK:       bb5:
-; CHECK-NEXT:    [[TMP6:%.*]] = load %0*, %0** [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = call fastcc i1 @widget(%0* [[TMP6]])
+; CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = call fastcc i1 @widget(ptr [[TMP6]])
 ; CHECK-NEXT:    br label [[BB5]]
 ;
 bb:
-  %tmp = getelementptr inbounds %0, %0* %arg, i64 0, i32 0
-  %tmp1 = load double ()*, double ()** %tmp, align 8
+  %tmp1 = load ptr, ptr %arg, align 8
   %tmp2 = tail call double %tmp1()
   br label %bb3
 
 bb3:                                              ; preds = %bb
-  %tmp4 = getelementptr inbounds %0, %0* %arg, i64 0, i32 1
+  %tmp4 = getelementptr inbounds %0, ptr %arg, i64 0, i32 1
   br label %bb5
 
 bb5:                                              ; preds = %bb5, %bb3
-  %tmp6 = load %0*, %0** %tmp4, align 8
-  %tmp7 = call fastcc i1 @widget(%0* %tmp6)
+  %tmp6 = load ptr, ptr %tmp4, align 8
+  %tmp7 = call fastcc i1 @widget(ptr %tmp6)
   br label %bb5
 }
 
@@ -55,13 +53,13 @@ define amdgpu_kernel void @entry() {
 ; CHECK-LABEL: define {{[^@]+}}@entry
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [[TMP0:%.*]], align 8, addrspace(5)
-; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast [[TMP0]] addrspace(5)* [[ALLOCA]] to %0*
-; CHECK-NEXT:    [[ARST:%.*]] = call double @baz(%0* [[CAST]])
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+; CHECK-NEXT:    [[ARST:%.*]] = call double @baz(ptr [[CAST]])
 ; CHECK-NEXT:    ret void
 ;
   %alloca = alloca %0, align 8, addrspace(5)
-  %cast = addrspacecast %0 addrspace(5)* %alloca to %0*
-  %arst = call double @baz(%0* %cast)
+  %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+  %arst = call double @baz(ptr %cast)
   ret void
 }
 ;.

diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
index 6e2533706639c..62db25c765c6b 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@@ -17,12 +17,12 @@
 
 ; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
-define amdgpu_kernel void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
+define amdgpu_kernel void @test_branch(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %val) #0 {
   %cmp = icmp ne i32 %val, 0
   br i1 %cmp, label %store, label %end
 
 store:
-  store i32 222, i32 addrspace(1)* %out
+  store i32 222, ptr addrspace(1) %out
   ret void
 
 end:
@@ -41,12 +41,12 @@ end:
 
 ; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
-define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
+define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 {
   %cmp0 = icmp ne i1 %val, 0
   br i1 %cmp0, label %store, label %end
 
 store:
-  store i32 222, i32 addrspace(1)* %out
+  store i32 222, ptr addrspace(1) %out
   ret void
 
 end:

diff --git a/llvm/test/CodeGen/AMDGPU/basic-loop.ll b/llvm/test/CodeGen/AMDGPU/basic-loop.ll
index d1dfd7c91dfb8..0b42b4d6b1c3e 100644
--- a/llvm/test/CodeGen/AMDGPU/basic-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-loop.ll
@@ -2,13 +2,13 @@
 ; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}test_loop:
-define amdgpu_kernel void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
+define amdgpu_kernel void @test_loop(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %val) nounwind {
 entry:
   br label %loop.body
 
 loop.body:
   %i = phi i32 [0, %entry], [%i.inc, %loop.body]
-  store i32 222, i32 addrspace(1)* %out
+  store i32 222, ptr addrspace(1) %out
   %cmp = icmp ne i32 %i, %val
   %i.inc = add i32 %i, 1
   br i1 %cmp, label %loop.body, label %end

diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
index cc6e654a5ddae..c91bd4ca099d3 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
@@ -239,7 +239,7 @@ define float @v_bfi_single_constant_as_partition(float %x, float %y, float %z) {
   ret float %result
 }
 
-define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(i32 addrspace(1)* %out, i16 %a, i32 %b) {; GFX10-LABEL: v_bfi_not_applied_in_scalar_case:
+define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(ptr addrspace(1) %out, i16 %a, i32 %b) {; GFX10-LABEL: v_bfi_not_applied_in_scalar_case:
 ; GCN-LABEL: v_bfi_dont_applied_for_scalar_ops:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -259,6 +259,6 @@ define amdgpu_kernel void @v_bfi_dont_applied_for_scalar_ops(i32 addrspace(1)* %
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
   %val = bitcast <2 x i16> %vec to i32
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll b/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
index cbc5963464222..6a48aee267b5f 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
@@ -3,35 +3,35 @@
 ; GCN-LABEL: {{^}}cast_constant_i64_to_build_vector_v4i16:
 ; GCN: global_store_short
 ; GCN: global_store_dwordx3
-define amdgpu_kernel void @cast_constant_i64_to_build_vector_v4i16(i8 addrspace(1)* nocapture %data) {
+define amdgpu_kernel void @cast_constant_i64_to_build_vector_v4i16(ptr addrspace(1) nocapture %data) {
 entry:
-  store i8 72, i8 addrspace(1)* %data, align 1
-  %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 1
-  store i8 101, i8 addrspace(1)* %arrayidx1, align 1
-  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 2
-  store i8 108, i8 addrspace(1)* %arrayidx2, align 1
-  %arrayidx3 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 3
-  store i8 108, i8 addrspace(1)* %arrayidx3, align 1
-  %arrayidx4 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 4
-  store i8 111, i8 addrspace(1)* %arrayidx4, align 1
-  %arrayidx5 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 5
-  store i8 44, i8 addrspace(1)* %arrayidx5, align 1
-  %arrayidx6 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 6
-  store i8 32, i8 addrspace(1)* %arrayidx6, align 1
-  %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 7
-  store i8 87, i8 addrspace(1)* %arrayidx7, align 1
-  %arrayidx8 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 8
-  store i8 111, i8 addrspace(1)* %arrayidx8, align 1
-  %arrayidx9 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 9
-  store i8 114, i8 addrspace(1)* %arrayidx9, align 1
-  %arrayidx10 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 10
-  store i8 108, i8 addrspace(1)* %arrayidx10, align 1
-  %arrayidx11 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 11
-  store i8 100, i8 addrspace(1)* %arrayidx11, align 1
-  %arrayidx12 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 12
-  store i8 33, i8 addrspace(1)* %arrayidx12, align 1
-  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 13
-  store i8 72, i8 addrspace(1)* %arrayidx13, align 1
+  store i8 72, ptr addrspace(1) %data, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 1
+  store i8 101, ptr addrspace(1) %arrayidx1, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 2
+  store i8 108, ptr addrspace(1) %arrayidx2, align 1
+  %arrayidx3 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 3
+  store i8 108, ptr addrspace(1) %arrayidx3, align 1
+  %arrayidx4 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 4
+  store i8 111, ptr addrspace(1) %arrayidx4, align 1
+  %arrayidx5 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 5
+  store i8 44, ptr addrspace(1) %arrayidx5, align 1
+  %arrayidx6 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 6
+  store i8 32, ptr addrspace(1) %arrayidx6, align 1
+  %arrayidx7 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 7
+  store i8 87, ptr addrspace(1) %arrayidx7, align 1
+  %arrayidx8 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 8
+  store i8 111, ptr addrspace(1) %arrayidx8, align 1
+  %arrayidx9 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 9
+  store i8 114, ptr addrspace(1) %arrayidx9, align 1
+  %arrayidx10 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 10
+  store i8 108, ptr addrspace(1) %arrayidx10, align 1
+  %arrayidx11 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 11
+  store i8 100, ptr addrspace(1) %arrayidx11, align 1
+  %arrayidx12 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 12
+  store i8 33, ptr addrspace(1) %arrayidx12, align 1
+  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %data, i64 13
+  store i8 72, ptr addrspace(1) %arrayidx13, align 1
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
index 6644449b14a87..e4c88a09f85a9 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -11,12 +11,12 @@
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(ptr addrspace(1) %out, <8 x i32> %vec) {
   %vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
-  store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
+  store volatile <8 x float> %vec0.bc, ptr addrspace(1) %out
 
   %vec1.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 9> to <8 x float>
-  store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
+  store volatile <8 x float> %vec1.bc, ptr addrspace(1) %out
   ret void
 }
 
@@ -27,12 +27,12 @@ define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> add
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(ptr addrspace(1) %out, <4 x i64> %vec) {
   %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
-  store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
+  store volatile <8 x float> %vec0.bc, ptr addrspace(1) %out
 
   %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <8 x float>
-  store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
+  store volatile <8 x float> %vec1.bc, ptr addrspace(1) %out
   ret void
 }
 
@@ -43,12 +43,12 @@ define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> add
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %vec) {
   %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
-  store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out
+  store volatile <4 x double> %vec0.bc, ptr addrspace(1) %out
 
   %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <4 x double>
-  store volatile <4 x double> %vec1.bc, <4 x double> addrspace(1)* %out
+  store volatile <4 x double> %vec1.bc, ptr addrspace(1) %out
   ret void
 }
 
@@ -59,31 +59,31 @@ define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> ad
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(ptr addrspace(1) %out, <16 x i16> %vec) {
   %vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
-  store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
+  store volatile <8 x float> %vec0.bc, ptr addrspace(1) %out
 
   %vec1.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 9> to <8 x float>
-  store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
+  store volatile <8 x float> %vec1.bc, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source:
 ; GCN-NOT: store_dword
-define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
   %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 999) #1
   %bc = bitcast i64 %undef to <2 x i32>
-  store volatile <2 x i32> %bc, <2 x i32> addrspace(1)* %out
+  store volatile <2 x i32> %bc, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt:
 ; GCN-NOT: store_dword
-define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
   %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 9999) #1
   %bc = bitcast i64 %undef to <2 x i32>
   %elt1 = extractelement <2 x i32> %bc, i32 1
-  store volatile i32 %elt1, i32 addrspace(1)* %out
+  store volatile i32 %elt1, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
index e4e32708af257..d156a6fce856c 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
@@ -7,8 +7,8 @@
 ; GCN-LABEL: {{^}}materialize_0_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[K]]
-define amdgpu_kernel void @materialize_0_i32(i32 addrspace(1)* %out) {
-  store i32 0, i32 addrspace(1)* %out
+define amdgpu_kernel void @materialize_0_i32(ptr addrspace(1) %out) {
+  store i32 0, ptr addrspace(1) %out
   ret void
 }
 
@@ -16,16 +16,16 @@ define amdgpu_kernel void @materialize_0_i32(i32 addrspace(1)* %out) {
 ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
 ; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]]
-define amdgpu_kernel void @materialize_0_i64(i64 addrspace(1)* %out) {
-  store i64 0, i64 addrspace(1)* %out
+define amdgpu_kernel void @materialize_0_i64(ptr addrspace(1) %out) {
+  store i64 0, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}materialize_neg1_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -1{{$}}
 ; GCN: buffer_store_dword [[K]]
-define amdgpu_kernel void @materialize_neg1_i32(i32 addrspace(1)* %out) {
-  store i32 -1, i32 addrspace(1)* %out
+define amdgpu_kernel void @materialize_neg1_i32(ptr addrspace(1) %out) {
+  store i32 -1, ptr addrspace(1) %out
   ret void
 }
 
@@ -33,16 +33,16 @@ define amdgpu_kernel void @materialize_neg1_i32(i32 addrspace(1)* %out) {
 ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
 ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
 ; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]]
-define amdgpu_kernel void @materialize_neg1_i64(i64 addrspace(1)* %out) {
-  store i64 -1, i64 addrspace(1)* %out
+define amdgpu_kernel void @materialize_neg1_i64(ptr addrspace(1) %out) {
+  store i64 -1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}materialize_signbit_i32:
 ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[K]]
-define amdgpu_kernel void @materialize_signbit_i32(i32 addrspace(1)* %out) {
-  store i32 -2147483648, i32 addrspace(1)* %out
+define amdgpu_kernel void @materialize_signbit_i32(ptr addrspace(1) %out) {
+  store i32 -2147483648, ptr addrspace(1) %out
   ret void
 }
 
@@ -50,16 +50,16 @@ define amdgpu_kernel void @materialize_signbit_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]]
-define amdgpu_kernel void @materialize_signbit_i64(i64 addrspace(1)* %out) {
-  store i64  -9223372036854775808, i64 addrspace(1)* %out
+define amdgpu_kernel void @materialize_signbit_i64(ptr addrspace(1) %out) {
+  store i64  -9223372036854775808, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}materialize_rev_neg16_i32:
 ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], -16{{$}}
 ; GCN: buffer_store_dword [[K]]
-define amdgpu_kernel void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
-  store i32 268435455, i32 addrspace(1)* %out
+define amdgpu_kernel void @materialize_rev_neg16_i32(ptr addrspace(1) %out) {
+  store i32 268435455, ptr addrspace(1) %out
   ret void
 }
 
@@ -67,16 +67,16 @@ define amdgpu_kernel void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], -16{{$}}
 ; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]]
-define amdgpu_kernel void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
-  store i64  1152921504606846975, i64 addrspace(1)* %out
+define amdgpu_kernel void @materialize_rev_neg16_i64(ptr addrspace(1) %out) {
+  store i64  1152921504606846975, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}materialize_rev_neg17_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xf7ffffff{{$}}
 ; GCN: buffer_store_dword [[K]]
-define amdgpu_kernel void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
-  store i32 -134217729, i32 addrspace(1)* %out
+define amdgpu_kernel void @materialize_rev_neg17_i32(ptr addrspace(1) %out) {
+  store i32 -134217729, ptr addrspace(1) %out
   ret void
 }
 
@@ -84,16 +84,16 @@ define amdgpu_kernel void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0xf7ffffff{{$}}
 ; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]]
-define amdgpu_kernel void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
-  store i64 -576460752303423489, i64 addrspace(1)* %out
+define amdgpu_kernel void @materialize_rev_neg17_i64(ptr addrspace(1) %out) {
+  store i64 -576460752303423489, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}materialize_rev_64_i32:
 ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 64{{$}}
 ; GCN: buffer_store_dword [[K]]
-define amdgpu_kernel void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
-  store i32 33554432, i32 addrspace(1)* %out
+define amdgpu_kernel void @materialize_rev_64_i32(ptr addrspace(1) %out) {
+  store i32 33554432, ptr addrspace(1) %out
   ret void
 }
 
@@ -101,16 +101,16 @@ define amdgpu_kernel void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 64{{$}}
 ; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]]
-define amdgpu_kernel void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
-  store i64 144115188075855872, i64 addrspace(1)* %out
+define amdgpu_kernel void @materialize_rev_64_i64(ptr addrspace(1) %out) {
+  store i64 144115188075855872, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}materialize_rev_65_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x82000000{{$}}
 ; GCN: buffer_store_dword [[K]]
-define amdgpu_kernel void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
-  store i32 -2113929216, i32 addrspace(1)* %out
+define amdgpu_kernel void @materialize_rev_65_i32(ptr addrspace(1) %out) {
+  store i32 -2113929216, ptr addrspace(1) %out
   ret void
 }
 
@@ -118,16 +118,16 @@ define amdgpu_kernel void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0x82000000{{$}}
 ; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]]
-define amdgpu_kernel void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
-  store i64 -9079256848778919936, i64 addrspace(1)* %out
+define amdgpu_kernel void @materialize_rev_65_i64(ptr addrspace(1) %out) {
+  store i64 -9079256848778919936, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}materialize_rev_3_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -2.0{{$}}
 ; GCN: buffer_store_dword [[K]]
-define amdgpu_kernel void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
-  store i32 -1073741824, i32 addrspace(1)* %out
+define amdgpu_kernel void @materialize_rev_3_i32(ptr addrspace(1) %out) {
+  store i32 -1073741824, ptr addrspace(1) %out
   ret void
 }
 
@@ -135,16 +135,16 @@ define amdgpu_kernel void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], -2.0{{$}}
 ; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]]
-define amdgpu_kernel void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
-  store i64 -4611686018427387904, i64 addrspace(1)* %out
+define amdgpu_kernel void @materialize_rev_3_i64(ptr addrspace(1) %out) {
+  store i64 -4611686018427387904, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}materialize_rev_1.0_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1fc{{$}}
 ; GCN: buffer_store_dword [[K]]
-define amdgpu_kernel void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
-  store i32 508, i32 addrspace(1)* %out
+define amdgpu_kernel void @materialize_rev_1.0_i32(ptr addrspace(1) %out) {
+  store i32 508, ptr addrspace(1) %out
   ret void
 }
 
@@ -152,8 +152,8 @@ define amdgpu_kernel void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0x1fc{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]]
-define amdgpu_kernel void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
-  store i64 508, i64 addrspace(1)* %out
+define amdgpu_kernel void @materialize_rev_1.0_i64(ptr addrspace(1) %out) {
+  store i64 508, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index b24f4f11182ec..5a252f22bdc02 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -31,21 +31,21 @@
 ; VI: buffer_store_short v[[B_F16]]
 ; VI: s_endpgm
 define amdgpu_kernel void @br_cc_f16(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %fcmp = fcmp olt half %a.val, %b.val
   br i1 %fcmp, label %one, label %two
 
 one:
-  store half %a.val, half addrspace(1)* %r
+  store half %a.val, ptr addrspace(1) %r
   ret void
 
 two:
-  store half %b.val, half addrspace(1)* %r
+  store half %b.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -70,19 +70,19 @@ two:
 ; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
 
 define amdgpu_kernel void @br_cc_f16_imm_a(
-    half addrspace(1)* %r,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %b) {
 entry:
-  %b.val = load half, half addrspace(1)* %b
+  %b.val = load half, ptr addrspace(1) %b
   %fcmp = fcmp olt half 0xH3800, %b.val
   br i1 %fcmp, label %one, label %two
 
 one:
-  store half 0xH3800, half addrspace(1)* %r
+  store half 0xH3800, ptr addrspace(1) %r
   ret void
 
 two:
-  store half %b.val, half addrspace(1)* %r
+  store half %b.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -103,18 +103,18 @@ two:
 ; GCN: buffer_store_short v[[B_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @br_cc_f16_imm_b(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
+  %a.val = load half, ptr addrspace(1) %a
   %fcmp = fcmp olt half %a.val, 0xH3800
   br i1 %fcmp, label %one, label %two
 
 one:
-  store half %a.val, half addrspace(1)* %r
+  store half %a.val, ptr addrspace(1) %r
   ret void
 
 two:
-  store half 0xH3800, half addrspace(1)* %r
+  store half 0xH3800, ptr addrspace(1) %r
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
index a6c063e6c47ea..0ed1e4d3b1227 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -30,7 +30,7 @@ bb:
   br i1 %tmp3, label %bb4, label %bb5
 
 bb4:                                              ; preds = %bb
-  store volatile i32 4, i32 addrspace(3)* undef
+  store volatile i32 4, ptr addrspace(3) undef
   unreachable
 
 bb5:                                              ; preds = %bb

diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll
index 1f1fe819528bb..d678ee633bc43 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll
@@ -30,7 +30,7 @@ declare void @func() #0
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, func@
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, func@
 ; GCN: s_swappc_b64
-define amdgpu_kernel void @bundle_size(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @bundle_size(ptr addrspace(1) %arg, i32 %cnd) #0 {
 bb:
   %cmp = icmp eq i32 %cnd, 0
   br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
@@ -46,7 +46,7 @@ bb2:
   br label %bb3
 
 bb3:
-  store volatile i32 %cnd, i32 addrspace(1)* %arg
+  store volatile i32 %cnd, ptr addrspace(1) %arg
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index bfb0dcd1a560a..f0b0afa02b1ac 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=5 -o - %s | FileCheck %s
 
-define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 {
 ; CHECK-LABEL: spill:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dword s27, s[4:5], 0x2
@@ -894,7 +894,7 @@ bb3:
   ret void
 }
 
-define void @spill_func(i32 addrspace(1)* %arg) #0 {
+define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-LABEL: spill_func:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
index 7d2a022a53fcf..397d9f5f84df7 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
@@ -23,7 +23,7 @@
 
 ; GCN: [[ENDBB]]:
 ; GCN: global_store_{{dword|b32}}
-define amdgpu_kernel void @long_forward_scc_branch_3f_offset_bug(i32 addrspace(1)* %arg, i32 %cnd0) #0 {
+define amdgpu_kernel void @long_forward_scc_branch_3f_offset_bug(ptr addrspace(1) %arg, i32 %cnd0) #0 {
 bb0:
   %cmp0 = icmp eq i32 %cnd0, 0
   br i1 %cmp0, label %bb2, label %bb3
@@ -47,7 +47,7 @@ bb2:
   br i1 %cmp1, label %bb2, label %bb3   ; +4 (gfx1030), +8 with workaround (gfx1010)
 
 bb3:
-  store volatile i32 %cnd0, i32 addrspace(1)* %arg
+  store volatile i32 %cnd0, ptr addrspace(1) %arg
   ret void
 }
 
@@ -72,7 +72,7 @@ bb3:
 
 ; GCN: [[ENDBB]]:
 ; GCN: global_store_{{dword|b32}}
-define void @long_forward_exec_branch_3f_offset_bug(i32 addrspace(1)* %arg, i32 %cnd0) #0 {
+define void @long_forward_exec_branch_3f_offset_bug(ptr addrspace(1) %arg, i32 %cnd0) #0 {
 bb0:
   %cmp0 = icmp eq i32 %cnd0, 0
   br i1 %cmp0, label %bb2, label %bb3
@@ -96,7 +96,7 @@ bb2:
   br i1 %cmp1, label %bb2, label %bb3   ; +4 (gfx1030), +8 with workaround (gfx1010)
 
 bb3:
-  store volatile i32 %cnd0, i32 addrspace(1)* %arg
+  store volatile i32 %cnd0, ptr addrspace(1) %arg
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
index 2f98eed6da872..0c7d42ef1cabf 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
@@ -15,7 +15,7 @@
 ; GFX10: s_add_u32
 ; GFX10: s_addc_u32
 ; GFX10: s_setpc_b64
-define amdgpu_kernel void @long_forward_branch_gfx10only(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @long_forward_branch_gfx10only(ptr addrspace(1) %arg, i32 %cnd) #0 {
 bb0:
   %cmp = icmp eq i32 %cnd, 0
   br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
@@ -29,6 +29,6 @@ bb2:
   br label %bb3
 
 bb3:
-  store volatile i32 %cnd, i32 addrspace(1)* %arg
+  store volatile i32 %cnd, ptr addrspace(1) %arg
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index ecc951a9cacdb..731c560f1a3e5 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -36,7 +36,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
 bb:
   %cmp = icmp eq i32 %cnd, 0
   br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
@@ -51,7 +51,7 @@ bb2:
   br label %bb3
 
 bb3:
-  store volatile i32 %cnd, i32 addrspace(1)* %arg
+  store volatile i32 %cnd, ptr addrspace(1) %arg
   ret void
 }
 
@@ -79,7 +79,7 @@ bb3:
 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
 bb0:
   %cmp = icmp eq i32 %cnd, 0
   br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
@@ -94,7 +94,7 @@ bb2:
   br label %bb3
 
 bb3:
-  store volatile i32 %cnd, i32 addrspace(1)* %arg
+  store volatile i32 %cnd, ptr addrspace(1) %arg
   ret void
 }
 
@@ -122,7 +122,7 @@ bb3:
 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 {
 bb0:
   %cmp = fcmp oeq float %cnd, 0.0
   br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
@@ -136,7 +136,7 @@ bb2:
   br label %bb3
 
 bb3:
-  store volatile float %cnd, float addrspace(1)* %arg
+  store volatile float %cnd, ptr addrspace(1) %arg
   ret void
 }
 
@@ -154,12 +154,12 @@ bb3:
 ; GCN: s_or_b64 exec, exec, [[SAVE]]
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = zext i32 %tid to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid.ext
-  %load = load volatile i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tid.ext
+  %load = load volatile i32, ptr addrspace(1) %gep
   %cmp = icmp eq i32 %load, 0
   br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
 
@@ -172,7 +172,7 @@ bb2:
   br label %bb3
 
 bb3:
-  store volatile i32 %load, i32 addrspace(1)* %gep
+  store volatile i32 %load, ptr addrspace(1) %gep
   ret void
 }
 
@@ -203,7 +203,7 @@ bb3:
 
 ; GCN-NEXT: [[ENDBB]]:
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 {
 bb:
   br label %bb2
 
@@ -252,13 +252,13 @@ bb3:
 ; GCN: ;;#ASMEND
 
 ; GCN: .Lfunc_end{{[0-9]+}}:
-define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) {
 bb0:
   %tmp = icmp ne i32 %arg1, 0
   br i1 %tmp, label %bb2, label %bb3
 
 bb2:
-  store volatile i32 17, i32 addrspace(1)* undef
+  store volatile i32 17, ptr addrspace(1) undef
   br label %bb4
 
 bb3:
@@ -271,7 +271,7 @@ bb3:
   br label %bb4
 
 bb4:
-  store volatile i32 63, i32 addrspace(1)* %arg
+  store volatile i32 63, ptr addrspace(1) %arg
   ret void
 }
 
@@ -296,7 +296,7 @@ bb4:
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], (.L[[LOOP]]-[[POST_GETPC]])>>32
 ; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]]
 ; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
-define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr addrspace(1) %arg, i32 %arg1) {
 entry:
   br label %loop
 
@@ -355,7 +355,7 @@ bb0:
   br i1 %cmp0, label %bb2, label %bb1
 
 bb1:
-  %val = load volatile i32, i32 addrspace(4)* undef
+  %val = load volatile i32, ptr addrspace(4) undef
   %cmp1 = icmp eq i32 %val, 3
   br i1 %cmp1, label %bb3, label %bb2
 
@@ -402,19 +402,19 @@ bb3:
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 ; GCN-NEXT: s_sleep 5
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
+define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %cond) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %d_cmp = icmp ult i32 %tid, 16
   br i1 %d_cmp, label %if, label %endif
 
 if:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   %u_cmp = icmp eq i32 %cond, 0
   br i1 %u_cmp, label %if_uniform, label %endif
 
 if_uniform:
-  store i32 1, i32 addrspace(1)* %out
+  store i32 1, ptr addrspace(1) %out
   br label %endif
 
 endif:
@@ -479,7 +479,7 @@ loop_body:
   br label %loop
 
 ret:
-  store volatile i32 7, i32 addrspace(1)* undef
+  store volatile i32 7, ptr addrspace(1) undef
   ret void
 }
 
@@ -504,7 +504,7 @@ ret:
 ; GCN: s_setpc_b64
 
 ; GCN: s_endpgm
-define amdgpu_kernel void @long_branch_hang(i32 addrspace(1)* nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 {
+define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 {
 bb:
   %tmp = icmp slt i32 %arg2, 9
   %tmp6 = icmp eq i32 %arg1, 0
@@ -535,8 +535,8 @@ bb14:                                             ; preds = %bb13, %bb9
 
 bb19:                                             ; preds = %bb14, %bb13, %bb9
   %tmp20 = phi i32 [ undef, %bb9 ], [ undef, %bb13 ], [ %tmp18, %bb14 ]
-  %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %arg5
-  store i32 %tmp20, i32 addrspace(1)* %tmp21, align 4
+  %tmp21 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %arg5
+  store i32 %tmp20, ptr addrspace(1) %tmp21, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
index 7f1400fec27f4..3de705a947be7 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s
 
-define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias dereferenceable(18446744073709551615) %arg0, i32 %arg1) {
+define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615) %arg0, i32 %arg1) {
   ; GCN-LABEL: name: mmo_offsets0
   ; GCN: bb.0.bb.0:
   ; GCN-NEXT:   liveins: $sgpr0, $vgpr0
@@ -193,7 +193,7 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe
   ; GCN-NEXT:   BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
   ; GCN-NEXT:   S_ENDPGM 0
 bb.0:
-  %tmp0 = load <4 x i32>, <4 x i32> addrspace(6)* %arg0, align 16, !invariant.load !0
+  %tmp0 = load <4 x i32>, ptr addrspace(6) %arg0, align 16, !invariant.load !0
   %buffer0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 16, i1 false, i1 false) #0
   %buffer1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0
   %buffer2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 1, i32 16, i1 false, i1 false) #0

diff --git a/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll b/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
index c7cc50045c895..5dbbaab3597bf 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
@@ -20,24 +20,24 @@ define amdgpu_cs void @_amdgpu_cs_main(<3 x i32> inreg %arg3, <3 x i32> %arg5) {
   %tmp9 = add <3 x i32> %arg3, %arg5
   %tmp10 = extractelement <3 x i32> %tmp9, i32 0
   %tmp11 = shl i32 %tmp10, 2
-  %tmp12 = inttoptr i64 undef to <4 x i32> addrspace(4)*
-  %tmp13 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp12, align 16
+  %tmp12 = inttoptr i64 undef to ptr addrspace(4)
+  %tmp13 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
   %tmp14 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp13, i32 0, i32 %tmp11, i1 false, i1 false) #0
-  %tmp17 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp12, align 16
+  %tmp17 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
   call void @llvm.amdgcn.buffer.store.f32(float %tmp14, <4 x i32> %tmp17, i32 0, i32 %tmp11, i1 false, i1 false) #0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp12, align 16
+  %tmp20 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
   %tmp21 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 %tmp11, i1 false, i1 false) #0
   %tmp22 = fadd reassoc nnan arcp contract float %tmp21, 1.000000e+00
   call void @llvm.amdgcn.buffer.store.f32(float %tmp22, <4 x i32> %tmp20, i32 0, i32 %tmp11, i1 false, i1 false) #0
-  %tmp25 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp12, align 16
+  %tmp25 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
   %tmp26 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp25, i32 0, i32 %tmp11, i1 false, i1 false) #0
   %tmp27 = fadd reassoc nnan arcp contract float %tmp26, 1.000000e+00
   call void @llvm.amdgcn.buffer.store.f32(float %tmp27, <4 x i32> %tmp25, i32 0, i32 %tmp11, i1 false, i1 false) #0
-  %tmp30 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp12, align 16
+  %tmp30 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
   %tmp31 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp30, i32 0, i32 %tmp11, i1 false, i1 false) #0
   %tmp32 = fadd reassoc nnan arcp contract float %tmp31, 1.000000e+00
   call void @llvm.amdgcn.buffer.store.f32(float %tmp32, <4 x i32> %tmp30, i32 0, i32 %tmp11, i1 false, i1 false) #0
-  %tmp35 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp12, align 16
+  %tmp35 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
   %tmp36 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp35, i32 0, i32 %tmp11, i1 false, i1 false) #0
   %tmp37 = fadd reassoc nnan arcp contract float %tmp36, 1.000000e+00
   call void @llvm.amdgcn.buffer.store.f32(float %tmp37, <4 x i32> %tmp35, i32 0, i32 %tmp11, i1 false, i1 false) #0

diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll
index 50ba7e19f46e0..a76390be123d9 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-scheduler-cycle.ll
@@ -8,17 +8,15 @@
 ; CHECK: ds_read_b32
 ; CHECK: ds_read_b32
 ; CHECK: ds_read_b32
-define amdgpu_gs float @_amdgpu_gs_main(i8 addrspace(3)* %arg0, i8 addrspace(3)* %arg1, i8 addrspace(3)* %arg2) #0 {
-  %tmp0 = bitcast i8 addrspace(3)* %arg0 to i32 addrspace(3)* addrspace(3)*
-  %tmp = load volatile i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %tmp0, align 4
+define amdgpu_gs float @_amdgpu_gs_main(ptr addrspace(3) %arg0, ptr addrspace(3) %arg1, ptr addrspace(3) %arg2) #0 {
+  %tmp = load volatile ptr addrspace(3), ptr addrspace(3) %arg0, align 4
 
-  %tmp3 = load volatile i32, i32 addrspace(3)* %tmp, align 4
+  %tmp3 = load volatile i32, ptr addrspace(3) %tmp, align 4
 
-  %tmp4a = bitcast i8 addrspace(3)* %arg1 to i32 addrspace(3)*
-  %tmp4 = load volatile i32, i32 addrspace(3)* %tmp4a, align 4
+  %tmp4 = load volatile i32, ptr addrspace(3) %arg1, align 4
 
-  %tmp7a = getelementptr i32, i32 addrspace(3)* %tmp, i32 8
-  %tmp8 = load volatile i32, i32 addrspace(3)* %tmp7a, align 4
+  %tmp7a = getelementptr i32, ptr addrspace(3) %tmp, i32 8
+  %tmp8 = load volatile i32, ptr addrspace(3) %tmp7a, align 4
 
   %tmp9 = add i32 %tmp3, %tmp8
   %tmp10 = add i32 %tmp9, %tmp4

diff --git a/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll b/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll
index 4ead1186f2d69..459913eb4f6e7 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-v4f64-subvector.ll
@@ -8,23 +8,22 @@
 ; CHECK: GLOBAL_STORE_DWORDX4
 define protected amdgpu_kernel void @test1() local_unnamed_addr !kernel_arg_addr_space !0 !kernel_arg_access_qual !1 !kernel_arg_type !2 !kernel_arg_base_type !2 !kernel_arg_type_qual !3 !kernel_arg_name !4 {
 entry:
-  %tmp = load <3 x i64>, <3 x i64> addrspace(4)* undef, align 16, !invariant.load !5
+  %tmp = load <3 x i64>, ptr addrspace(4) undef, align 16, !invariant.load !5
   %srcA.load2 = extractelement <3 x i64> %tmp, i32 0
-  %tmp1 = inttoptr i64 %srcA.load2 to double addrspace(1)*
-  %tmp2 = getelementptr inbounds double, double addrspace(1)* %tmp1, i64 undef
-  %tmp3 = bitcast double addrspace(1)* %tmp2 to <3 x double> addrspace(1)*
-  %tmp4 = load <3 x double>, <3 x double> addrspace(1)* %tmp3, align 8, !tbaa !6
+  %tmp1 = inttoptr i64 %srcA.load2 to ptr addrspace(1)
+  %tmp2 = getelementptr inbounds double, ptr addrspace(1) %tmp1, i64 undef
+  %tmp4 = load <3 x double>, ptr addrspace(1) %tmp2, align 8, !tbaa !6
   %tmp5 = extractelement <3 x double> %tmp4, i32 1
   %tmp6 = insertelement <3 x double> undef, double %tmp5, i32 1
   %tmp7 = insertelement <3 x double> %tmp6, double undef, i32 2
-  %tmp8 = load <3 x double>, <3 x double> addrspace(1)* undef, align 8, !tbaa !6
+  %tmp8 = load <3 x double>, ptr addrspace(1) undef, align 8, !tbaa !6
   %tmp9 = extractelement <3 x double> %tmp8, i32 2
   %tmp10 = insertelement <3 x double> undef, double %tmp9, i32 2
   %tmp11 = fcmp olt <3 x double> %tmp10, %tmp7
   %tmp12 = select <3 x i1> %tmp11, <3 x double> zeroinitializer, <3 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
   %tmp13 = extractelement <3 x double> %tmp12, i64 1
   %tmp14 = insertelement <2 x double> undef, double %tmp13, i32 1
-  store <2 x double> %tmp14, <2 x double> addrspace(1)* undef, align 8, !tbaa !6
+  store <2 x double> %tmp14, ptr addrspace(1) undef, align 8, !tbaa !6
   ret void
 }
 
@@ -36,29 +35,28 @@ entry:
 ; CHECK: GLOBAL_STORE_DWORDX2
 define protected amdgpu_kernel void @test2() local_unnamed_addr !kernel_arg_addr_space !0 !kernel_arg_access_qual !1 !kernel_arg_type !2 !kernel_arg_base_type !2 !kernel_arg_type_qual !3 !kernel_arg_name !4 {
 entry:
-  %tmp = load <3 x i64>, <3 x i64> addrspace(4)* undef, align 16, !invariant.load !5
+  %tmp = load <3 x i64>, ptr addrspace(4) undef, align 16, !invariant.load !5
   %srcA.load2 = extractelement <3 x i64> %tmp, i32 0
-  %tmp1 = inttoptr i64 %srcA.load2 to double addrspace(1)*
-  %tmp2 = getelementptr inbounds double, double addrspace(1)* %tmp1, i64 undef
-  %tmp3 = bitcast double addrspace(1)* %tmp2 to <3 x double> addrspace(1)*
-  %tmp4 = load <3 x double>, <3 x double> addrspace(1)* %tmp3, align 8, !tbaa !6
+  %tmp1 = inttoptr i64 %srcA.load2 to ptr addrspace(1)
+  %tmp2 = getelementptr inbounds double, ptr addrspace(1) %tmp1, i64 undef
+  %tmp4 = load <3 x double>, ptr addrspace(1) %tmp2, align 8, !tbaa !6
   %tmp5 = extractelement <3 x double> %tmp4, i32 1
   %tmp6 = insertelement <3 x double> undef, double %tmp5, i32 1
   %tmp7 = insertelement <3 x double> %tmp6, double undef, i32 2
-  %tmp8 = load <3 x double>, <3 x double> addrspace(1)* undef, align 8, !tbaa !6
+  %tmp8 = load <3 x double>, ptr addrspace(1) undef, align 8, !tbaa !6
   %tmp9 = extractelement <3 x double> %tmp8, i32 1
   %tmp10 = insertelement <3 x double> undef, double %tmp9, i32 1
   %tmp11 = insertelement <3 x double> %tmp10, double undef, i32 2
   %tmp12 = fcmp olt <3 x double> %tmp11, %tmp7
   %tmp13 = select <3 x i1> %tmp12, <3 x double> zeroinitializer, <3 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
   %tmp14 = extractelement <3 x double> %tmp13, i64 2
-  store double %tmp14, double addrspace(1)* undef, align 8, !tbaa !6
+  store double %tmp14, ptr addrspace(1) undef, align 8, !tbaa !6
   ret void
 }
 
 !0 = !{i32 1, i32 1, i32 1}
 !1 = !{!"none", !"none", !"none"}
-!2 = !{!"double*", !"double*", !"double*"}
+!2 = !{!"ptr", !"ptr", !"ptr"}
 !3 = !{!"", !"", !""}
 !4 = !{!"srcA", !"srcB", !"dst"}
 !5 = !{}

diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll b/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
index 433baf4386145..efe8f9303e2dd 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
@@ -6,18 +6,17 @@
 ; GCN-LABEL: {{^}}combine_loop:
 ; GCN: flat_load_short_d16_hi
 ; GCN: flat_store_short
-define amdgpu_kernel void @combine_loop(i16* %arg) #0 {
+define amdgpu_kernel void @combine_loop(ptr %arg) #0 {
 bb:
   br label %bb1
 
 bb1:
   %tmp = phi <2 x i16> [ <i16 15360, i16 15360>, %bb ], [ %tmp5, %bb1 ]
   %tmp2 = phi half [ 0xH0000, %bb ], [ %tmp8, %bb1 ]
-  %tmp3 = load volatile half, half* null, align 536870912
+  %tmp3 = load volatile half, ptr null, align 536870912
   %tmp4 = bitcast half %tmp3 to i16
   %tmp5 = insertelement <2 x i16> <i16 0, i16 undef>, i16 %tmp4, i32 1
-  %tmp6 = bitcast i16* %arg to half*
-  store volatile half %tmp2, half* %tmp6, align 2
+  store volatile half %tmp2, ptr %arg, align 2
   %tmp7 = bitcast <2 x i16> %tmp to <2 x half>
   %tmp8 = extractelement <2 x half> %tmp7, i32 0
   br label %bb1

diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 5104d4a80a59f..52e9192a439f5 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -15,9 +15,9 @@
 ; GFX678: buffer_store_dwordx2 v[[[X]]:[[Y]]]
 ; GFX10: global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX11: global_store_b64 v2, v[0:1], s[0:1]
-define amdgpu_kernel void @build_vector2 (<2 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
 entry:
-  store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
+  store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
   ret void
 }
 
@@ -38,9 +38,9 @@ entry:
 ; GFX678: buffer_store_dwordx4 v[[[X]]:[[W]]]
 ; GFX10: global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX11: global_store_b128 v4, v[0:3], s[0:1]
-define amdgpu_kernel void @build_vector4 (<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
 entry:
-  store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
+  store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
   ret void
 }
 
@@ -58,9 +58,9 @@ entry:
 ; GFX1011: s_waitcnt lgkmcnt(0)
 ; GFX10: global_store_dword v0, v1, s[0:1]
 ; GFX11: global_store_b32 v0, v1, s[0:1]
-define amdgpu_kernel void @build_vector_v2i16 (<2 x i16> addrspace(1)* %out) {
+define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
 entry:
-  store <2 x i16> <i16 5, i16 6>, <2 x i16> addrspace(1)* %out
+  store <2 x i16> <i16 5, i16 6>, ptr addrspace(1) %out
   ret void
 }
 
@@ -91,11 +91,11 @@ entry:
 ; GFX1011: v_mov_b32_e32 v1, s2
 ; GFX10: global_store_dword v0, v1, s[0:1]
 ; GFX11: global_store_b32 v0, v1, s[0:1]
-define amdgpu_kernel void @build_vector_v2i16_trunc (<2 x i16> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 %a) {
   %srl = lshr i32 %a, 16
   %trunc = trunc i32 %srl to i16
   %ins.0 = insertelement <2 x i16> undef, i16 %trunc, i32 0
   %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
-  store <2 x i16> %ins.1, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %ins.1, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 4f576f8429f0d..7cc65d9615c18 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -16,10 +16,10 @@
 ; GCN-LABEL: @sadd64rr
 ; GCN:       s_add_u32
 ; GCN:       s_addc_u32
-define amdgpu_kernel void @sadd64rr(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 entry:
   %add = add i64 %a, %b
-  store i64 %add, i64 addrspace(1)* %out
+  store i64 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -31,10 +31,10 @@ entry:
 ; GCN-LABEL: @sadd64ri
 ; GCN:       s_add_u32  s{{[0-9]+}}, s{{[0-9]+}}, 0x56789876
 ; GCN:       s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1234
-define amdgpu_kernel void @sadd64ri(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
 entry:
   %add = add i64 20015998343286, %a
-  store i64 %add, i64 addrspace(1)* %out
+  store i64 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -61,12 +61,12 @@ entry:
 ;
 ; GFX11: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX11: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]]
-define amdgpu_kernel void @vadd64rr(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %add = add i64 %a, %tid.ext
-  store i64 %add, i64 addrspace(1)* %out
+  store i64 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -96,12 +96,12 @@ entry:
 ;
 ; GFX11: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], 0x56789876, v{{[0-9]+}}
 ; GFX11: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, 0, 0x1234, [[CARRY]]
-define amdgpu_kernel void @vadd64ri(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %add = add i64 20015998343286, %tid.ext
-  store i64 %add, i64 addrspace(1)* %out
+  store i64 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -109,11 +109,11 @@ entry:
 ; GCN-ISEL-LABEL: body:
 ; GCN-ISEL-LABEL: bb.0
 ; GCN-ISEL: S_ADD_I32
-define amdgpu_kernel void @suaddo32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -144,12 +144,12 @@ define amdgpu_kernel void @suaddo32(i32 addrspace(1)* %out, i1 addrspace(1)* %ca
 ;
 ; GFX11: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX11: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CARRY]]
-define amdgpu_kernel void @uaddo32_vcc_user(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -162,12 +162,12 @@ define amdgpu_kernel void @uaddo32_vcc_user(i32 addrspace(1)* %out, i1 addrspace
 ;
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
-define amdgpu_kernel void @suaddo64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
-  store i64 %val, i64 addrspace(1)* %out, align 8
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i64 %val, ptr addrspace(1) %out, align 8
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -194,14 +194,14 @@ define amdgpu_kernel void @suaddo64(i64 addrspace(1)* %out, i1 addrspace(1)* %ca
 ;
 ; GFX11: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v0
 ; GFX11: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]]
-define amdgpu_kernel void @vuaddo64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a) #0 {
+define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %tid.ext)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
-  store i64 %val, i64 addrspace(1)* %out, align 8
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i64 %val, ptr addrspace(1) %out, align 8
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -213,10 +213,10 @@ define amdgpu_kernel void @vuaddo64(i64 addrspace(1)* %out, i1 addrspace(1)* %ca
 ; GCN-LABEL: @ssub64rr
 ; GCN:       s_sub_u32
 ; GCN:       s_subb_u32
-define amdgpu_kernel void @ssub64rr(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 entry:
   %sub = sub i64 %a, %b
-  store i64 %sub, i64 addrspace(1)* %out
+  store i64 %sub, ptr addrspace(1) %out
   ret void
 }
 
@@ -228,10 +228,10 @@ entry:
 ; GCN-LABEL: @ssub64ri
 ; GCN:       s_sub_u32  s{{[0-9]+}}, 0x56789876, s{{[0-9]+}}
 ; GCN:       s_subb_u32 s{{[0-9]+}}, 0x1234, s{{[0-9]+}}
-define amdgpu_kernel void @ssub64ri(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
 entry:
   %sub = sub i64 20015998343286, %a
-  store i64 %sub, i64 addrspace(1)* %out
+  store i64 %sub, ptr addrspace(1) %out
   ret void
 }
 
@@ -258,12 +258,12 @@ entry:
 ;
 ; GFX11: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX11: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]]
-define amdgpu_kernel void @vsub64rr(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %sub = sub i64 %a, %tid.ext
-  store i64 %sub, i64 addrspace(1)* %out
+  store i64 %sub, ptr addrspace(1) %out
   ret void
 }
 
@@ -293,12 +293,12 @@ entry:
 ;
 ; GFX11: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], 0x56789876, v{{[0-9]+}}
 ; GFX11: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, 0x1234, 0, [[CARRY]]
-define amdgpu_kernel void @vsub64ri(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %sub = sub i64 20015998343286, %tid.ext
-  store i64 %sub, i64 addrspace(1)* %out
+  store i64 %sub, ptr addrspace(1) %out
   ret void
 }
 
@@ -306,11 +306,11 @@ entry:
 ; GCN-ISEL-LABEL: body:
 ; GCN-ISEL-LABEL: bb.0
 ; GCN-ISEL: S_SUB_I32
-define amdgpu_kernel void @susubo32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -341,12 +341,12 @@ define amdgpu_kernel void @susubo32(i32 addrspace(1)* %out, i1 addrspace(1)* %ca
 ;
 ; GFX11: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX11: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CARRY]]
-define amdgpu_kernel void @usubo32_vcc_user(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -359,12 +359,12 @@ define amdgpu_kernel void @usubo32_vcc_user(i32 addrspace(1)* %out, i1 addrspace
 ;
 ; GCN: s_sub_u32
 ; GCN: s_subb_u32
-define amdgpu_kernel void @susubo64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
-  store i64 %val, i64 addrspace(1)* %out, align 8
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i64 %val, ptr addrspace(1) %out, align 8
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -391,14 +391,14 @@ define amdgpu_kernel void @susubo64(i64 addrspace(1)* %out, i1 addrspace(1)* %ca
 ;
 ; GFX11: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v0
 ; GFX11: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]]
-define amdgpu_kernel void @vusubo64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a) #0 {
+define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %tid.ext)
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
-  store i64 %val, i64 addrspace(1)* %out, align 8
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i64 %val, ptr addrspace(1) %out, align 8
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -409,9 +409,9 @@ define amdgpu_kernel void @vusubo64(i64 addrspace(1)* %out, i1 addrspace(1)* %ca
 ; GCN-ISEL: S_ADD_CO_PSEUDO %{{[0-9]+}}, killed %{{[0-9]+}}, killed %[[CARRY]]
 ; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_SUB_CO_U32_e64
 ; GCN-ISEL: S_SUB_CO_PSEUDO killed %{{[0-9]+}}, %{{[0-9]+}}, %[[CARRY]]
-define amdgpu_kernel void @sudiv64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
   %result = udiv i64 %x, %y
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/cayman-loop-bug.ll b/llvm/test/CodeGen/AMDGPU/cayman-loop-bug.ll
index 34e6669434f1b..87381d8b113c8 100644
--- a/llvm/test/CodeGen/AMDGPU/cayman-loop-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/cayman-loop-bug.ll
@@ -23,7 +23,7 @@ outer_loop_body:
 
 inner_loop:
   %cnt2 = phi i32 [0, %outer_loop_body], [%cnt2_incr, %inner_loop_body]
-  %n = load volatile i32, i32 addrspace(1)* undef
+  %n = load volatile i32, ptr addrspace(1) undef
   %cond2 = icmp slt i32 %cnt2, %n
   br i1 %cond2, label %inner_loop_body, label %outer_loop
 

diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index 70aaf355ea34e..880a88ed9026d 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s | FileCheck -check-prefix=GCN_DBG %s
 
-define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN-LABEL: test_loop:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xa
@@ -97,15 +97,15 @@ for.exit:
 for.body:
   %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %tmp = add i32 %indvar, 32
-  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
-  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
+  %vecload = load float, ptr addrspace(3) %arrayidx, align 4
   %add = fadd float %vecload, 1.0
-  store float %add, float addrspace(3)* %arrayidx, align 8
+  store float %add, ptr addrspace(3) %arrayidx, align 8
   %inc = add i32 %indvar, 1
   br label %for.body
 }
 
-define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN-LABEL: loop_const_true:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
@@ -184,15 +184,15 @@ for.exit:
 for.body:
   %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %tmp = add i32 %indvar, 32
-  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
-  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
+  %vecload = load float, ptr addrspace(3) %arrayidx, align 4
   %add = fadd float %vecload, 1.0
-  store float %add, float addrspace(3)* %arrayidx, align 8
+  store float %add, ptr addrspace(3) %arrayidx, align 8
   %inc = add i32 %indvar, 1
   br i1 true, label %for.body, label %for.exit
 }
 
-define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN-LABEL: loop_const_false:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
@@ -268,15 +268,15 @@ for.exit:
 for.body:
   %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %tmp = add i32 %indvar, 32
-  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
-  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
+  %vecload = load float, ptr addrspace(3) %arrayidx, align 4
   %add = fadd float %vecload, 1.0
-  store float %add, float addrspace(3)* %arrayidx, align 8
+  store float %add, ptr addrspace(3) %arrayidx, align 8
   %inc = add i32 %indvar, 1
   br i1 false, label %for.body, label %for.exit
 }
 
-define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN-LABEL: loop_const_undef:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
@@ -350,15 +350,15 @@ for.exit:
 for.body:
   %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %tmp = add i32 %indvar, 32
-  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
-  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
+  %vecload = load float, ptr addrspace(3) %arrayidx, align 4
   %add = fadd float %vecload, 1.0
-  store float %add, float addrspace(3)* %arrayidx, align 8
+  store float %add, ptr addrspace(3) %arrayidx, align 8
   %inc = add i32 %indvar, 1
   br i1 undef, label %for.body, label %for.exit
 }
 
-define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN-LABEL: loop_arg_0:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
@@ -452,7 +452,7 @@ define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind
 ; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB4_1
 ; GCN_DBG-NEXT:    s_branch .LBB4_2
 entry:
-  %cond = load volatile i1, i1 addrspace(3)* null
+  %cond = load volatile i1, ptr addrspace(3) null
   br label %for.body
 
 for.exit:
@@ -461,10 +461,10 @@ for.exit:
 for.body:
   %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %tmp = add i32 %indvar, 32
-  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
-  %vecload = load float, float addrspace(3)* %arrayidx, align 4
+  %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
+  %vecload = load float, ptr addrspace(3) %arrayidx, align 4
   %add = fadd float %vecload, 1.0
-  store float %add, float addrspace(3)* %arrayidx, align 8
+  store float %add, ptr addrspace(3) %arrayidx, align 8
   %inc = add i32 %indvar, 1
   br i1 %cond, label %for.body, label %for.exit
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll b/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll
index 53fe89730f3aa..630fdae50bc49 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll
@@ -35,7 +35,7 @@
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested3:
-define amdgpu_kernel void @nested3(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested3(ptr addrspace(1) %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -45,7 +45,7 @@ if.1:
   br i1 %1, label %if.2, label %if.store.1
 
 if.store.1:
-  store i32 1, i32 addrspace(1)* %out
+  store i32 1, ptr addrspace(1) %out
   br label %end
 
 if.2:
@@ -53,11 +53,11 @@ if.2:
   br i1 %2, label %if.3, label %if.2.store
 
 if.2.store:
-  store i32 2, i32 addrspace(1)* %out
+  store i32 2, ptr addrspace(1) %out
   br label %end
 
 if.3:
-  store i32 3, i32 addrspace(1)* %out
+  store i32 3, ptr addrspace(1) %out
   br label %end
 
 end:
@@ -68,7 +68,7 @@ end:
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested4:
-define amdgpu_kernel void @nested4(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested4(ptr addrspace(1) %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -78,7 +78,7 @@ if.1:
   br i1 %1, label %if.2, label %if.1.store
 
 if.1.store:
-  store i32 1, i32 addrspace(1)* %out
+  store i32 1, ptr addrspace(1) %out
   br label %end
 
 if.2:
@@ -86,7 +86,7 @@ if.2:
   br i1 %2, label %if.3, label %if.2.store
 
 if.2.store:
-  store i32 2, i32 addrspace(1)* %out
+  store i32 2, ptr addrspace(1) %out
   br label %end
 
 if.3:
@@ -94,11 +94,11 @@ if.3:
   br i1 %3, label %if.4, label %if.3.store
 
 if.3.store:
-  store i32 3, i32 addrspace(1)* %out
+  store i32 3, ptr addrspace(1) %out
   br label %end
 
 if.4:
-  store i32 4, i32 addrspace(1)* %out
+  store i32 4, ptr addrspace(1) %out
   br label %end
 
 end:
@@ -109,7 +109,7 @@ end:
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested7:
-define amdgpu_kernel void @nested7(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested7(ptr addrspace(1) %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -119,7 +119,7 @@ if.1:
   br i1 %1, label %if.2, label %if.1.store
 
 if.1.store:
-  store i32 1, i32 addrspace(1)* %out
+  store i32 1, ptr addrspace(1) %out
   br label %end
 
 if.2:
@@ -127,7 +127,7 @@ if.2:
   br i1 %2, label %if.3, label %if.2.store
 
 if.2.store:
-  store i32 2, i32 addrspace(1)* %out
+  store i32 2, ptr addrspace(1) %out
   br label %end
 
 if.3:
@@ -135,7 +135,7 @@ if.3:
   br i1 %3, label %if.4, label %if.3.store
 
 if.3.store:
-  store i32 3, i32 addrspace(1)* %out
+  store i32 3, ptr addrspace(1) %out
   br label %end
 
 if.4:
@@ -143,7 +143,7 @@ if.4:
   br i1 %4, label %if.5, label %if.4.store
 
 if.4.store:
-  store i32 4, i32 addrspace(1)* %out
+  store i32 4, ptr addrspace(1) %out
   br label %end
 
 if.5:
@@ -151,7 +151,7 @@ if.5:
   br i1 %5, label %if.6, label %if.5.store
 
 if.5.store:
-  store i32 5, i32 addrspace(1)* %out
+  store i32 5, ptr addrspace(1) %out
   br label %end
 
 if.6:
@@ -159,11 +159,11 @@ if.6:
   br i1 %6, label %if.7, label %if.6.store
 
 if.6.store:
-  store i32 6, i32 addrspace(1)* %out
+  store i32 6, ptr addrspace(1) %out
   br label %end
 
 if.7:
-  store i32 7, i32 addrspace(1)* %out
+  store i32 7, ptr addrspace(1) %out
   br label %end
 
 end:
@@ -174,7 +174,7 @@ end:
 ; BUG32: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested8:
-define amdgpu_kernel void @nested8(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested8(ptr addrspace(1) %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -184,7 +184,7 @@ if.1:
   br i1 %1, label %if.2, label %if.1.store
 
 if.1.store:
-  store i32 1, i32 addrspace(1)* %out
+  store i32 1, ptr addrspace(1) %out
   br label %end
 
 if.2:
@@ -192,7 +192,7 @@ if.2:
   br i1 %2, label %if.3, label %if.2.store
 
 if.2.store:
-  store i32 2, i32 addrspace(1)* %out
+  store i32 2, ptr addrspace(1) %out
   br label %end
 
 if.3:
@@ -200,7 +200,7 @@ if.3:
   br i1 %3, label %if.4, label %if.3.store
 
 if.3.store:
-  store i32 3, i32 addrspace(1)* %out
+  store i32 3, ptr addrspace(1) %out
   br label %end
 
 if.4:
@@ -208,7 +208,7 @@ if.4:
   br i1 %4, label %if.5, label %if.4.store
 
 if.4.store:
-  store i32 4, i32 addrspace(1)* %out
+  store i32 4, ptr addrspace(1) %out
   br label %end
 
 if.5:
@@ -216,7 +216,7 @@ if.5:
   br i1 %5, label %if.6, label %if.5.store
 
 if.5.store:
-  store i32 5, i32 addrspace(1)* %out
+  store i32 5, ptr addrspace(1) %out
   br label %end
 
 if.6:
@@ -224,7 +224,7 @@ if.6:
   br i1 %6, label %if.7, label %if.6.store
 
 if.6.store:
-  store i32 6, i32 addrspace(1)* %out
+  store i32 6, ptr addrspace(1) %out
   br label %end
 
 if.7:
@@ -232,11 +232,11 @@ if.7:
   br i1 %7, label %if.8, label %if.7.store
 
 if.7.store:
-  store i32 7, i32 addrspace(1)* %out
+  store i32 7, ptr addrspace(1) %out
   br label %end
 
 if.8:
-  store i32 8, i32 addrspace(1)* %out
+  store i32 8, ptr addrspace(1) %out
   br label %end
 
 end:

diff  --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index c5fdacb710589..21cbd196f183b 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -9,69 +9,65 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefix=GFX10 %s
 
-define void @test_sinkable_flat_small_offset_i32(i32* %out, i32* %in, i32 %cond) {
+define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) {
 ; OPT-GFX7-LABEL: @test_sinkable_flat_small_offset_i32(
 ; OPT-GFX7-NEXT:  entry:
-; OPT-GFX7-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 999999
-; OPT-GFX7-NEXT:    [[IN_GEP:%.*]] = getelementptr i32, i32* [[IN:%.*]], i64 7
+; OPT-GFX7-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999
+; OPT-GFX7-NEXT:    [[IN_GEP:%.*]] = getelementptr i32, ptr [[IN:%.*]], i64 7
 ; OPT-GFX7-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0
 ; OPT-GFX7-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT-GFX7:       if:
-; OPT-GFX7-NEXT:    [[LOAD:%.*]] = load i32, i32* [[IN_GEP]], align 4
+; OPT-GFX7-NEXT:    [[LOAD:%.*]] = load i32, ptr [[IN_GEP]], align 4
 ; OPT-GFX7-NEXT:    br label [[ENDIF]]
 ; OPT-GFX7:       endif:
 ; OPT-GFX7-NEXT:    [[X:%.*]] = phi i32 [ [[LOAD]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-GFX7-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-GFX7-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-GFX7-NEXT:    br label [[DONE:%.*]]
 ; OPT-GFX7:       done:
 ; OPT-GFX7-NEXT:    ret void
 ;
 ; OPT-GFX8-LABEL: @test_sinkable_flat_small_offset_i32(
 ; OPT-GFX8-NEXT:  entry:
-; OPT-GFX8-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 999999
-; OPT-GFX8-NEXT:    [[IN_GEP:%.*]] = getelementptr i32, i32* [[IN:%.*]], i64 7
+; OPT-GFX8-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999
+; OPT-GFX8-NEXT:    [[IN_GEP:%.*]] = getelementptr i32, ptr [[IN:%.*]], i64 7
 ; OPT-GFX8-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0
 ; OPT-GFX8-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT-GFX8:       if:
-; OPT-GFX8-NEXT:    [[LOAD:%.*]] = load i32, i32* [[IN_GEP]], align 4
+; OPT-GFX8-NEXT:    [[LOAD:%.*]] = load i32, ptr [[IN_GEP]], align 4
 ; OPT-GFX8-NEXT:    br label [[ENDIF]]
 ; OPT-GFX8:       endif:
 ; OPT-GFX8-NEXT:    [[X:%.*]] = phi i32 [ [[LOAD]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-GFX8-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-GFX8-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-GFX8-NEXT:    br label [[DONE:%.*]]
 ; OPT-GFX8:       done:
 ; OPT-GFX8-NEXT:    ret void
 ;
 ; OPT-GFX9-LABEL: @test_sinkable_flat_small_offset_i32(
 ; OPT-GFX9-NEXT:  entry:
-; OPT-GFX9-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 999999
+; OPT-GFX9-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999
 ; OPT-GFX9-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0
 ; OPT-GFX9-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT-GFX9:       if:
-; OPT-GFX9-NEXT:    [[TMP0:%.*]] = bitcast i32* [[IN:%.*]] to i8*
-; OPT-GFX9-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, i8* [[TMP0]], i64 28
-; OPT-GFX9-NEXT:    [[TMP1:%.*]] = bitcast i8* [[SUNKADDR]] to i32*
-; OPT-GFX9-NEXT:    [[LOAD:%.*]] = load i32, i32* [[TMP1]], align 4
+; OPT-GFX9-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 28
+; OPT-GFX9-NEXT:    [[LOAD:%.*]] = load i32, ptr [[SUNKADDR]], align 4
 ; OPT-GFX9-NEXT:    br label [[ENDIF]]
 ; OPT-GFX9:       endif:
 ; OPT-GFX9-NEXT:    [[X:%.*]] = phi i32 [ [[LOAD]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-GFX9-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-GFX9-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-GFX9-NEXT:    ret void
 ;
 ; OPT-GFX10-LABEL: @test_sinkable_flat_small_offset_i32(
 ; OPT-GFX10-NEXT:  entry:
-; OPT-GFX10-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 999999
+; OPT-GFX10-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999
 ; OPT-GFX10-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0
 ; OPT-GFX10-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT-GFX10:       if:
-; OPT-GFX10-NEXT:    [[TMP0:%.*]] = bitcast i32* [[IN:%.*]] to i8*
-; OPT-GFX10-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, i8* [[TMP0]], i64 28
-; OPT-GFX10-NEXT:    [[TMP1:%.*]] = bitcast i8* [[SUNKADDR]] to i32*
-; OPT-GFX10-NEXT:    [[LOAD:%.*]] = load i32, i32* [[TMP1]], align 4
+; OPT-GFX10-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 28
+; OPT-GFX10-NEXT:    [[LOAD:%.*]] = load i32, ptr [[SUNKADDR]], align 4
 ; OPT-GFX10-NEXT:    br label [[ENDIF]]
 ; OPT-GFX10:       endif:
 ; OPT-GFX10-NEXT:    [[X:%.*]] = phi i32 [ [[LOAD]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-GFX10-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-GFX10-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-GFX10-NEXT:    ret void
 ;
 ; GFX7-LABEL: test_sinkable_flat_small_offset_i32:
@@ -152,86 +148,83 @@ define void @test_sinkable_flat_small_offset_i32(i32* %out, i32* %in, i32 %cond)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %out.gep = getelementptr i32, i32* %out, i64 999999
-  %in.gep = getelementptr i32, i32* %in, i64 7
+  %out.gep = getelementptr i32, ptr %out, i64 999999
+  %in.gep = getelementptr i32, ptr %in, i64 7
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %endif, label %if
 
 if:
-  %load = load i32, i32* %in.gep
+  %load = load i32, ptr %in.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %load, %if ], [ 0, %entry ]
-  store i32 %x, i32* %out.gep
+  store i32 %x, ptr %out.gep
   br label %done
 
 done:
   ret void
 }
 
-define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32* %out, i32* %in, i32 %cond) {
+define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, i32 %cond) {
 ; OPT-GFX7-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32(
 ; OPT-GFX7-NEXT:  entry:
-; OPT-GFX7-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 999999
+; OPT-GFX7-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999
 ; OPT-GFX7-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0
 ; OPT-GFX7-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT-GFX7:       if:
-; OPT-GFX7-NEXT:    [[TMP0:%.*]] = addrspacecast i32* [[IN:%.*]] to i8 addrspace(1)*
-; OPT-GFX7-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
-; OPT-GFX7-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to i32 addrspace(1)*
-; OPT-GFX7-NEXT:    [[LOAD:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4
+; OPT-GFX7-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[IN:%.*]] to ptr addrspace(1)
+; OPT-GFX7-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP0]], i64 28
+; OPT-GFX7-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[SUNKADDR]], align 4
 ; OPT-GFX7-NEXT:    br label [[ENDIF]]
 ; OPT-GFX7:       endif:
 ; OPT-GFX7-NEXT:    [[X:%.*]] = phi i32 [ [[LOAD]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-GFX7-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-GFX7-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-GFX7-NEXT:    ret void
 ;
 ; OPT-GFX8-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32(
 ; OPT-GFX8-NEXT:  entry:
-; OPT-GFX8-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 999999
-; OPT-GFX8-NEXT:    [[IN_GEP:%.*]] = getelementptr i32, i32* [[IN:%.*]], i64 7
+; OPT-GFX8-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999
+; OPT-GFX8-NEXT:    [[IN_GEP:%.*]] = getelementptr i32, ptr [[IN:%.*]], i64 7
 ; OPT-GFX8-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0
 ; OPT-GFX8-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT-GFX8:       if:
-; OPT-GFX8-NEXT:    [[TMP0:%.*]] = addrspacecast i32* [[IN_GEP]] to i32 addrspace(1)*
-; OPT-GFX8-NEXT:    [[LOAD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; OPT-GFX8-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[IN_GEP]] to ptr addrspace(1)
+; OPT-GFX8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
 ; OPT-GFX8-NEXT:    br label [[ENDIF]]
 ; OPT-GFX8:       endif:
 ; OPT-GFX8-NEXT:    [[X:%.*]] = phi i32 [ [[LOAD]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-GFX8-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-GFX8-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-GFX8-NEXT:    ret void
 ;
 ; OPT-GFX9-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32(
 ; OPT-GFX9-NEXT:  entry:
-; OPT-GFX9-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 999999
+; OPT-GFX9-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999
 ; OPT-GFX9-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0
 ; OPT-GFX9-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT-GFX9:       if:
-; OPT-GFX9-NEXT:    [[TMP0:%.*]] = addrspacecast i32* [[IN:%.*]] to i8 addrspace(1)*
-; OPT-GFX9-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
-; OPT-GFX9-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to i32 addrspace(1)*
-; OPT-GFX9-NEXT:    [[LOAD:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4
+; OPT-GFX9-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[IN:%.*]] to ptr addrspace(1)
+; OPT-GFX9-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP0]], i64 28
+; OPT-GFX9-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[SUNKADDR]], align 4
 ; OPT-GFX9-NEXT:    br label [[ENDIF]]
 ; OPT-GFX9:       endif:
 ; OPT-GFX9-NEXT:    [[X:%.*]] = phi i32 [ [[LOAD]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-GFX9-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-GFX9-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-GFX9-NEXT:    ret void
 ;
 ; OPT-GFX10-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32(
 ; OPT-GFX10-NEXT:  entry:
-; OPT-GFX10-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 999999
+; OPT-GFX10-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999
 ; OPT-GFX10-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0
 ; OPT-GFX10-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT-GFX10:       if:
-; OPT-GFX10-NEXT:    [[TMP0:%.*]] = addrspacecast i32* [[IN:%.*]] to i8 addrspace(1)*
-; OPT-GFX10-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
-; OPT-GFX10-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to i32 addrspace(1)*
-; OPT-GFX10-NEXT:    [[LOAD:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4
+; OPT-GFX10-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[IN:%.*]] to ptr addrspace(1)
+; OPT-GFX10-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP0]], i64 28
+; OPT-GFX10-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[SUNKADDR]], align 4
 ; OPT-GFX10-NEXT:    br label [[ENDIF]]
 ; OPT-GFX10:       endif:
 ; OPT-GFX10-NEXT:    [[X:%.*]] = phi i32 [ [[LOAD]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-GFX10-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-GFX10-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-GFX10-NEXT:    ret void
 ;
 ; GFX7-LABEL: test_sink_noop_addrspacecast_flat_to_global_i32:
@@ -314,40 +307,39 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32* %out, i32* %in
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %out.gep = getelementptr i32, i32* %out, i64 999999
-  %in.gep = getelementptr i32, i32* %in, i64 7
-  %cast = addrspacecast i32* %in.gep to i32 addrspace(1)*
+  %out.gep = getelementptr i32, ptr %out, i64 999999
+  %in.gep = getelementptr i32, ptr %in, i64 7
+  %cast = addrspacecast ptr %in.gep to ptr addrspace(1)
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %endif, label %if
 
 if:
-  %load = load i32, i32 addrspace(1)* %cast
+  %load = load i32, ptr addrspace(1) %cast
   br label %endif
 
 endif:
   %x = phi i32 [ %load, %if ], [ 0, %entry ]
-  store i32 %x, i32* %out.gep
+  store i32 %x, ptr %out.gep
   br label %done
 
 done:
   ret void
 }
 
-define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32* %out, i32* %in, i32 %cond) {
+define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in, i32 %cond) {
 ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_constant_i32(
 ; OPT-NEXT:  entry:
-; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 999999
+; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999
 ; OPT-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0
 ; OPT-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT:       if:
-; OPT-NEXT:    [[TMP0:%.*]] = addrspacecast i32* [[IN:%.*]] to i8 addrspace(4)*
-; OPT-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP0]], i64 28
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[SUNKADDR]] to i32 addrspace(4)*
-; OPT-NEXT:    [[LOAD:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
+; OPT-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[IN:%.*]] to ptr addrspace(4)
+; OPT-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i64 28
+; OPT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[SUNKADDR]], align 4
 ; OPT-NEXT:    br label [[ENDIF]]
 ; OPT:       endif:
 ; OPT-NEXT:    [[X:%.*]] = phi i32 [ [[LOAD]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-NEXT:    ret void
 ;
 ; GFX7-LABEL: test_sink_noop_addrspacecast_flat_to_constant_i32:
@@ -430,92 +422,92 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32* %out, i32* %
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %out.gep = getelementptr i32, i32* %out, i64 999999
-  %in.gep = getelementptr i32, i32* %in, i64 7
-  %cast = addrspacecast i32* %in.gep to i32 addrspace(4)*
+  %out.gep = getelementptr i32, ptr %out, i64 999999
+  %in.gep = getelementptr i32, ptr %in, i64 7
+  %cast = addrspacecast ptr %in.gep to ptr addrspace(4)
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %endif, label %if
 
 if:
-  %load = load i32, i32 addrspace(4)* %cast
+  %load = load i32, ptr addrspace(4) %cast
   br label %endif
 
 endif:
   %x = phi i32 [ %load, %if ], [ 0, %entry ]
-  store i32 %x, i32* %out.gep
+  store i32 %x, ptr %out.gep
   br label %done
 
 done:
   ret void
 }
 
-define void @test_sink_flat_small_max_flat_offset(i32* %out, i8* %in) #1 {
+define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 {
 ; OPT-GFX7-LABEL: @test_sink_flat_small_max_flat_offset(
 ; OPT-GFX7-NEXT:  entry:
-; OPT-GFX7-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i32 1024
-; OPT-GFX7-NEXT:    [[IN_GEP:%.*]] = getelementptr i8, i8* [[IN:%.*]], i64 4095
+; OPT-GFX7-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024
+; OPT-GFX7-NEXT:    [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4095
 ; OPT-GFX7-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
 ; OPT-GFX7-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[TID]], 0
 ; OPT-GFX7-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT-GFX7:       if:
-; OPT-GFX7-NEXT:    [[LOAD:%.*]] = load i8, i8* [[IN_GEP]], align 1
+; OPT-GFX7-NEXT:    [[LOAD:%.*]] = load i8, ptr [[IN_GEP]], align 1
 ; OPT-GFX7-NEXT:    [[CAST:%.*]] = sext i8 [[LOAD]] to i32
 ; OPT-GFX7-NEXT:    br label [[ENDIF]]
 ; OPT-GFX7:       endif:
 ; OPT-GFX7-NEXT:    [[X:%.*]] = phi i32 [ [[CAST]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-GFX7-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-GFX7-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-GFX7-NEXT:    br label [[DONE:%.*]]
 ; OPT-GFX7:       done:
 ; OPT-GFX7-NEXT:    ret void
 ;
 ; OPT-GFX8-LABEL: @test_sink_flat_small_max_flat_offset(
 ; OPT-GFX8-NEXT:  entry:
-; OPT-GFX8-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i32 1024
-; OPT-GFX8-NEXT:    [[IN_GEP:%.*]] = getelementptr i8, i8* [[IN:%.*]], i64 4095
+; OPT-GFX8-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024
+; OPT-GFX8-NEXT:    [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4095
 ; OPT-GFX8-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
 ; OPT-GFX8-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[TID]], 0
 ; OPT-GFX8-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT-GFX8:       if:
-; OPT-GFX8-NEXT:    [[LOAD:%.*]] = load i8, i8* [[IN_GEP]], align 1
+; OPT-GFX8-NEXT:    [[LOAD:%.*]] = load i8, ptr [[IN_GEP]], align 1
 ; OPT-GFX8-NEXT:    [[CAST:%.*]] = sext i8 [[LOAD]] to i32
 ; OPT-GFX8-NEXT:    br label [[ENDIF]]
 ; OPT-GFX8:       endif:
 ; OPT-GFX8-NEXT:    [[X:%.*]] = phi i32 [ [[CAST]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-GFX8-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-GFX8-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-GFX8-NEXT:    br label [[DONE:%.*]]
 ; OPT-GFX8:       done:
 ; OPT-GFX8-NEXT:    ret void
 ;
 ; OPT-GFX9-LABEL: @test_sink_flat_small_max_flat_offset(
 ; OPT-GFX9-NEXT:  entry:
-; OPT-GFX9-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i32 1024
+; OPT-GFX9-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024
 ; OPT-GFX9-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
 ; OPT-GFX9-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[TID]], 0
 ; OPT-GFX9-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT-GFX9:       if:
-; OPT-GFX9-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, i8* [[IN:%.*]], i64 4095
-; OPT-GFX9-NEXT:    [[LOAD:%.*]] = load i8, i8* [[SUNKADDR]], align 1
+; OPT-GFX9-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4095
+; OPT-GFX9-NEXT:    [[LOAD:%.*]] = load i8, ptr [[SUNKADDR]], align 1
 ; OPT-GFX9-NEXT:    [[CAST:%.*]] = sext i8 [[LOAD]] to i32
 ; OPT-GFX9-NEXT:    br label [[ENDIF]]
 ; OPT-GFX9:       endif:
 ; OPT-GFX9-NEXT:    [[X:%.*]] = phi i32 [ [[CAST]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-GFX9-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-GFX9-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-GFX9-NEXT:    ret void
 ;
 ; OPT-GFX10-LABEL: @test_sink_flat_small_max_flat_offset(
 ; OPT-GFX10-NEXT:  entry:
-; OPT-GFX10-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i32 1024
-; OPT-GFX10-NEXT:    [[IN_GEP:%.*]] = getelementptr i8, i8* [[IN:%.*]], i64 4095
+; OPT-GFX10-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024
+; OPT-GFX10-NEXT:    [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4095
 ; OPT-GFX10-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
 ; OPT-GFX10-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[TID]], 0
 ; OPT-GFX10-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT-GFX10:       if:
-; OPT-GFX10-NEXT:    [[LOAD:%.*]] = load i8, i8* [[IN_GEP]], align 1
+; OPT-GFX10-NEXT:    [[LOAD:%.*]] = load i8, ptr [[IN_GEP]], align 1
 ; OPT-GFX10-NEXT:    [[CAST:%.*]] = sext i8 [[LOAD]] to i32
 ; OPT-GFX10-NEXT:    br label [[ENDIF]]
 ; OPT-GFX10:       endif:
 ; OPT-GFX10-NEXT:    [[X:%.*]] = phi i32 [ [[CAST]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-GFX10-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-GFX10-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-GFX10-NEXT:    br label [[DONE:%.*]]
 ; OPT-GFX10:       done:
 ; OPT-GFX10-NEXT:    ret void
@@ -604,41 +596,41 @@ define void @test_sink_flat_small_max_flat_offset(i32* %out, i8* %in) #1 {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %out.gep = getelementptr i32, i32* %out, i32 1024
-  %in.gep = getelementptr i8, i8* %in, i64 4095
+  %out.gep = getelementptr i32, ptr %out, i32 1024
+  %in.gep = getelementptr i8, ptr %in, i64 4095
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %cmp0 = icmp eq i32 %tid, 0
   br i1 %cmp0, label %endif, label %if
 
 if:
-  %load = load i8, i8* %in.gep
+  %load = load i8, ptr %in.gep
   %cast = sext i8 %load to i32
   br label %endif
 
 endif:
   %x = phi i32 [ %cast, %if ], [ 0, %entry ]
-  store i32 %x, i32* %out.gep
+  store i32 %x, ptr %out.gep
   br label %done
 
 done:
   ret void
 }
 
-define void @test_sink_flat_small_max_plus_1_flat_offset(i32* %out, i8* %in) #1 {
+define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 {
 ; OPT-LABEL: @test_sink_flat_small_max_plus_1_flat_offset(
 ; OPT-NEXT:  entry:
-; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i64 99999
-; OPT-NEXT:    [[IN_GEP:%.*]] = getelementptr i8, i8* [[IN:%.*]], i64 4096
+; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 99999
+; OPT-NEXT:    [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4096
 ; OPT-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
 ; OPT-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[TID]], 0
 ; OPT-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT:       if:
-; OPT-NEXT:    [[LOAD:%.*]] = load i8, i8* [[IN_GEP]], align 1
+; OPT-NEXT:    [[LOAD:%.*]] = load i8, ptr [[IN_GEP]], align 1
 ; OPT-NEXT:    [[CAST:%.*]] = sext i8 [[LOAD]] to i32
 ; OPT-NEXT:    br label [[ENDIF]]
 ; OPT:       endif:
 ; OPT-NEXT:    [[X:%.*]] = phi i32 [ [[CAST]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-NEXT:    br label [[DONE:%.*]]
 ; OPT:       done:
 ; OPT-NEXT:    ret void
@@ -729,41 +721,41 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(i32* %out, i8* %in) #1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %out.gep = getelementptr i32, i32* %out, i64 99999
-  %in.gep = getelementptr i8, i8* %in, i64 4096
+  %out.gep = getelementptr i32, ptr %out, i64 99999
+  %in.gep = getelementptr i8, ptr %in, i64 4096
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %cmp0 = icmp eq i32 %tid, 0
   br i1 %cmp0, label %endif, label %if
 
 if:
-  %load = load i8, i8* %in.gep
+  %load = load i8, ptr %in.gep
   %cast = sext i8 %load to i32
   br label %endif
 
 endif:
   %x = phi i32 [ %cast, %if ], [ 0, %entry ]
-  store i32 %x, i32* %out.gep
+  store i32 %x, ptr %out.gep
   br label %done
 
 done:
   ret void
 }
 
-define void @test_sinkable_flat_reg_offset(i32* %out, i8* %in, i64 %reg) #1 {
+define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 {
 ; OPT-LABEL: @test_sinkable_flat_reg_offset(
 ; OPT-NEXT:  entry:
-; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32* [[OUT:%.*]], i32 1024
-; OPT-NEXT:    [[IN_GEP:%.*]] = getelementptr i8, i8* [[IN:%.*]], i64 [[REG:%.*]]
+; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024
+; OPT-NEXT:    [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 [[REG:%.*]]
 ; OPT-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3]]
 ; OPT-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[TID]], 0
 ; OPT-NEXT:    br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT:       if:
-; OPT-NEXT:    [[LOAD:%.*]] = load i8, i8* [[IN_GEP]], align 1
+; OPT-NEXT:    [[LOAD:%.*]] = load i8, ptr [[IN_GEP]], align 1
 ; OPT-NEXT:    [[CAST:%.*]] = sext i8 [[LOAD]] to i32
 ; OPT-NEXT:    br label [[ENDIF]]
 ; OPT:       endif:
 ; OPT-NEXT:    [[X:%.*]] = phi i32 [ [[CAST]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-NEXT:    store i32 [[X]], i32* [[OUT_GEP]], align 4
+; OPT-NEXT:    store i32 [[X]], ptr [[OUT_GEP]], align 4
 ; OPT-NEXT:    br label [[DONE:%.*]]
 ; OPT:       done:
 ; OPT-NEXT:    ret void
@@ -854,20 +846,20 @@ define void @test_sinkable_flat_reg_offset(i32* %out, i8* %in, i64 %reg) #1 {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %out.gep = getelementptr i32, i32* %out, i32 1024
-  %in.gep = getelementptr i8, i8* %in, i64 %reg
+  %out.gep = getelementptr i32, ptr %out, i32 1024
+  %in.gep = getelementptr i8, ptr %in, i64 %reg
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %cmp0 = icmp eq i32 %tid, 0
   br i1 %cmp0, label %endif, label %if
 
 if:
-  %load = load i8, i8* %in.gep
+  %load = load i8, ptr %in.gep
   %cast = sext i8 %load to i32
   br label %endif
 
 endif:
   %x = phi i32 [ %cast, %if ], [ 0, %entry ]
-  store i32 %x, i32* %out.gep
+  store i32 %x, ptr %out.gep
   br label %done
 
 done:

diff  --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index 31c490a70c311..46602976f13f6 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -35,24 +35,24 @@
 
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define amdgpu_kernel void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i32(ptr addrspace(1) %out, i32 %arg1) #0 {
 entry:
   %shr = lshr i32 %arg1, 8
   br i1 undef, label %bb0, label %bb1
 
 bb0:
   %val0 = and i32 %shr, 255
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %ret
 
 bb1:
   %val1 = and i32 %shr, 127
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %ret
 
 ret:
   %phi = phi i32 [ %val0, %bb0 ], [ %val1, %bb1 ]
-  store i32 %phi, i32 addrspace(1)* %out
+  store i32 %phi, ptr addrspace(1) %out
   ret void
 }
 
@@ -75,24 +75,24 @@ ret:
 ; OPT: ret
 
 ; GCN-LABEL: {{^}}sink_sbfe_i32:
-define amdgpu_kernel void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+define amdgpu_kernel void @sink_sbfe_i32(ptr addrspace(1) %out, i32 %arg1) #0 {
 entry:
   %shr = ashr i32 %arg1, 8
   br i1 undef, label %bb0, label %bb1
 
 bb0:
   %val0 = and i32 %shr, 255
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %ret
 
 bb1:
   %val1 = and i32 %shr, 127
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %ret
 
 ret:
   %phi = phi i32 [ %val0, %bb0 ], [ %val1, %bb1 ]
-  store i32 %phi, i32 addrspace(1)* %out
+  store i32 %phi, ptr addrspace(1) %out
   ret void
 }
 
@@ -132,24 +132,24 @@ ret:
 
 ; GCN: buffer_store_short
 ; GCN: s_endpgm
-define amdgpu_kernel void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i16(ptr addrspace(1) %out, i16 %arg1) #0 {
 entry:
   %shr = lshr i16 %arg1, 4
   br i1 undef, label %bb0, label %bb1
 
 bb0:
   %val0 = and i16 %shr, 255
-  store volatile i16 0, i16 addrspace(1)* undef
+  store volatile i16 0, ptr addrspace(1) undef
   br label %ret
 
 bb1:
   %val1 = and i16 %shr, 127
-  store volatile i16 0, i16 addrspace(1)* undef
+  store volatile i16 0, ptr addrspace(1) undef
   br label %ret
 
 ret:
   %phi = phi i16 [ %val0, %bb0 ], [ %val1, %bb1 ]
-  store i16 %phi, i16 addrspace(1)* %out
+  store i16 %phi, ptr addrspace(1) %out
   ret void
 }
 
@@ -183,24 +183,24 @@ ret:
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xff, v[[LO]]
 
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(ptr addrspace(1) %out, i64 %arg1) #0 {
 entry:
   %shr = lshr i64 %arg1, 30
   br i1 undef, label %bb0, label %bb1
 
 bb0:
   %val0 = and i64 %shr, 255
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %ret
 
 bb1:
   %val1 = and i64 %shr, 127
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %ret
 
 ret:
   %phi = phi i64 [ %val0, %bb0 ], [ %val1, %bb1 ]
-  store i64 %phi, i64 addrspace(1)* %out
+  store i64 %phi, ptr addrspace(1) %out
   ret void
 }
 
@@ -231,24 +231,24 @@ ret:
 ; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000f
 
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_low32(ptr addrspace(1) %out, i64 %arg1) #0 {
 entry:
   %shr = lshr i64 %arg1, 15
   br i1 undef, label %bb0, label %bb1
 
 bb0:
   %val0 = and i64 %shr, 255
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %ret
 
 bb1:
   %val1 = and i64 %shr, 127
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %ret
 
 ret:
   %phi = phi i64 [ %val0, %bb0 ], [ %val1, %bb1 ]
-  store i64 %phi, i64 addrspace(1)* %out
+  store i64 %phi, ptr addrspace(1) %out
   ret void
 }
 
@@ -277,24 +277,24 @@ ret:
 ; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80003
 
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_high32(ptr addrspace(1) %out, i64 %arg1) #0 {
 entry:
   %shr = lshr i64 %arg1, 35
   br i1 undef, label %bb0, label %bb1
 
 bb0:
   %val0 = and i64 %shr, 255
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %ret
 
 bb1:
   %val1 = and i64 %shr, 127
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %ret
 
 ret:
   %phi = phi i64 [ %val0, %bb0 ], [ %val1, %bb1 ]
-  store i64 %phi, i64 addrspace(1)* %out
+  store i64 %phi, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
index 5b7c8e4550b96..9edf566335925 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
@@ -9,14 +9,14 @@
 ; GCN-NEXT: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}}
 ; GCN-NEXT: global_store_dwordx2 v{{[0-9]+}}, v[[[LO]]:{{[0-9]+\]}}, s[{{[0-9:]+}}]
 
-define amdgpu_kernel void @test_odd_int4(<4 x i32> addrspace(1)* %arg, <2 x i32> addrspace(1)* %arg1) {
+define amdgpu_kernel void @test_odd_int4(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %lid
-  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep1, align 16
+  %gep1 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i32 %lid
+  %load = load <4 x i32>, ptr addrspace(1) %gep1, align 16
   %shuffle = shufflevector <4 x i32> %load, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
-  %gep2 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %arg1, i32 %lid
-  store <2 x i32> %shuffle, <2 x i32> addrspace(1)* %gep2, align 8
+  %gep2 = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg1, i32 %lid
+  store <2 x i32> %shuffle, ptr addrspace(1) %gep2, align 8
   ret void
 }
 
@@ -26,11 +26,11 @@ bb:
 ; GCN:     global_store_dwordx4 v[{{[0-9]*[02468]:[0-9]*[13579]}}], v[{{[0-9]*[02468]:[0-9]*[13579]}}]
 define amdgpu_kernel void @test_vector_creation() {
 entry:
-  %tmp231 = load <4 x i16>, <4 x i16> addrspace(1)* undef, align 2
+  %tmp231 = load <4 x i16>, ptr addrspace(1) undef, align 2
   %vext466 = shufflevector <4 x i16> %tmp231, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %vecinit467 = shufflevector <8 x i16> undef, <8 x i16> %vext466, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
   %vecinit471 = shufflevector <8 x i16> %vecinit467, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-  store <8 x i16> %vecinit471, <8 x i16> addrspace(1)* undef, align 16
+  store <8 x i16> %vecinit471, ptr addrspace(1) undef, align 16
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
index 679aea660ff6e..37f36547bc17c 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -13,7 +13,7 @@ declare float @llvm.fma.f32(float, float, float)
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; It's probably OK if this is slightly higher:
 ; CHECK: ; NumVgprs: 8
-define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* noalias %out, <4 x float> addrspace(1)* noalias %in, i32 %flag) {
+define amdgpu_kernel void @foobar(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %flag) {
 entry:
   %cmpflag = icmp eq i32 %flag, 1
   br i1 %cmpflag, label %loop, label %exit
@@ -27,9 +27,9 @@ loop:
 
   ; Try to get the 0 constant to get coalesced into a wide register
   %blup = insertelement <4 x float> undef, float %v0, i32 0
-  store <4 x float> %blup, <4 x float> addrspace(1)* %out
+  store <4 x float> %blup, ptr addrspace(1) %out
 
-  %load = load <4 x float>, <4 x float> addrspace(1)* %in
+  %load = load <4 x float>, ptr addrspace(1) %in
   %load.0 = extractelement <4 x float> %load, i32 0
   %load.1 = extractelement <4 x float> %load, i32 1
   %load.2 = extractelement <4 x float> %load, i32 2
@@ -52,6 +52,6 @@ exit:
   %dst.1 = insertelement <4 x float> %dst.0, float %ev1, i32 1
   %dst.2 = insertelement <4 x float> %dst.1, float %ev2, i32 2
   %dst.3 = insertelement <4 x float> %dst.2, float %ev3, i32 3
-  store <4 x float> %dst.3, <4 x float> addrspace(1)* %out
+  store <4 x float> %dst.3, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 2bc34cedd1e03..1db0f29ed74a1 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -66,26 +66,26 @@
 ; OSABI-AMDHSA-ELF: AMDGPU 0x{{[0-9a-f]+}} NT_AMDGPU_METADATA (AMDGPU Metadata)
 
 define amdgpu_kernel void @fadd(
-    float addrspace(1)* %r,
-    float addrspace(1)* %a,
-    float addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load float, float addrspace(1)* %a
-  %b.val = load float, float addrspace(1)* %b
+  %a.val = load float, ptr addrspace(1) %a
+  %b.val = load float, ptr addrspace(1) %b
   %r.val = fadd float %a.val, %b.val
-  store float %r.val, float addrspace(1)* %r
+  store float %r.val, ptr addrspace(1) %r
   ret void
 }
 
 define amdgpu_kernel void @fsub(
-    float addrspace(1)* %r,
-    float addrspace(1)* %a,
-    float addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load float, float addrspace(1)* %a
-  %b.val = load float, float addrspace(1)* %b
+  %a.val = load float, ptr addrspace(1) %a
+  %b.val = load float, ptr addrspace(1) %b
   %r.val = fsub float %a.val, %b.val
-  store float %r.val, float addrspace(1)* %r
+  store float %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -97,9 +97,9 @@ entry:
 ; ALL-ASM:     .amdhsa_next_free_sgpr 1
 define amdgpu_kernel void @empty(
     i32 %i,
-    float addrspace(1)* %r,
-    float addrspace(1)* %a,
-    float addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 4210a8111ea8e..ee9da868068af 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -42,26 +42,26 @@
 ; GCN-O0:      ds_write_b32
 ; GCN-O0:      s_endpgm
 ;
-define amdgpu_kernel void @simple_nested_if(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = icmp ugt i32 %tmp, 1
   br i1 %tmp1, label %bb.outer.then, label %bb.outer.end
 
 bb.outer.then:                                    ; preds = %bb
-  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
-  store i32 0, i32 addrspace(1)* %tmp4, align 4
+  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
+  store i32 0, ptr addrspace(1) %tmp4, align 4
   %tmp5 = icmp eq i32 %tmp, 2
   br i1 %tmp5, label %bb.outer.end, label %bb.inner.then
 
 bb.inner.then:                                    ; preds = %bb.outer.then
   %tmp7 = add i32 %tmp, 1
-  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp7
-  store i32 1, i32 addrspace(1)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
+  store i32 1, ptr addrspace(1) %tmp9, align 4
   br label %bb.outer.end
 
 bb.outer.end:                                     ; preds = %bb.outer.then, %bb.inner.then, %bb
-  store i32 3, i32 addrspace(3)* null
+  store i32 3, ptr addrspace(3) null
   ret void
 }
 
@@ -111,32 +111,32 @@ bb.outer.end:                                     ; preds = %bb.outer.then, %bb.
 ; GCN-O0:      ds_write_b32
 ; GCN-O0:      s_endpgm
 ;
-define amdgpu_kernel void @uncollapsable_nested_if(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = icmp ugt i32 %tmp, 1
   br i1 %tmp1, label %bb.outer.then, label %bb.outer.end
 
 bb.outer.then:                                    ; preds = %bb
-  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
-  store i32 0, i32 addrspace(1)* %tmp4, align 4
+  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
+  store i32 0, ptr addrspace(1) %tmp4, align 4
   %tmp5 = icmp eq i32 %tmp, 2
   br i1 %tmp5, label %bb.inner.end, label %bb.inner.then
 
 bb.inner.then:                                    ; preds = %bb.outer.then
   %tmp7 = add i32 %tmp, 1
-  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp7
-  store i32 1, i32 addrspace(1)* %tmp8, align 4
+  %tmp8 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
+  store i32 1, ptr addrspace(1) %tmp8, align 4
   br label %bb.inner.end
 
 bb.inner.end:                                     ; preds = %bb.inner.then, %bb.outer.then
   %tmp9 = add i32 %tmp, 2
-  %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp9
-  store i32 2, i32 addrspace(1)* %tmp10, align 4
+  %tmp10 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp9
+  store i32 2, ptr addrspace(1) %tmp10, align 4
   br label %bb.outer.end
 
 bb.outer.end:                                     ; preds = %bb.inner.then, %bb
-  store i32 3, i32 addrspace(3)* null
+  store i32 3, ptr addrspace(3) null
   ret void
 }
 
@@ -201,11 +201,11 @@ bb.outer.end:                                     ; preds = %bb.inner.then, %bb
 ; GCN-O0:      ds_write_b32
 ; GCN-O0:      s_endpgm
 ;
-define amdgpu_kernel void @nested_if_if_else(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
-  store i32 0, i32 addrspace(1)* %tmp1, align 4
+  %tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
+  store i32 0, ptr addrspace(1) %tmp1, align 4
   %tmp2 = icmp ugt i32 %tmp, 1
   br i1 %tmp2, label %bb.outer.then, label %bb.outer.end
 
@@ -215,18 +215,18 @@ bb.outer.then:                                       ; preds = %bb
 
 bb.then:                                             ; preds = %bb.outer.then
   %tmp3 = add i32 %tmp, 1
-  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp3
-  store i32 1, i32 addrspace(1)* %tmp4, align 4
+  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp3
+  store i32 1, ptr addrspace(1) %tmp4, align 4
   br label %bb.outer.end
 
 bb.else:                                             ; preds = %bb.outer.then
   %tmp7 = add i32 %tmp, 2
-  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp7
-  store i32 2, i32 addrspace(1)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp7
+  store i32 2, ptr addrspace(1) %tmp9, align 4
   br label %bb.outer.end
 
 bb.outer.end:                                        ; preds = %bb, %bb.then, %bb.else
-  store i32 3, i32 addrspace(3)* null
+  store i32 3, ptr addrspace(3) null
   ret void
 }
 
@@ -314,38 +314,38 @@ bb.outer.end:                                        ; preds = %bb, %bb.then, %b
 ; GCN-O0:      ds_write_b32
 ; GCN-O0:      s_endpgm
 ;
-define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
-  store i32 0, i32 addrspace(1)* %tmp1, align 4
+  %tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
+  store i32 0, ptr addrspace(1) %tmp1, align 4
   %cc1 = icmp ugt i32 %tmp, 1
   br i1 %cc1, label %bb.outer.then, label %bb.outer.else
 
 bb.outer.then:
-  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 1
-  store i32 1, i32 addrspace(1)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 1
+  store i32 1, ptr addrspace(1) %tmp2, align 4
   %cc2 = icmp eq i32 %tmp, 2
   br i1 %cc2, label %bb.inner.then, label %bb.outer.end
 
 bb.inner.then:
-  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 2
-  store i32 2, i32 addrspace(1)* %tmp3, align 4
+  %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 2
+  store i32 2, ptr addrspace(1) %tmp3, align 4
   br label %bb.outer.end
 
 bb.outer.else:
-  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 3
-  store i32 3, i32 addrspace(1)* %tmp4, align 4
+  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 3
+  store i32 3, ptr addrspace(1) %tmp4, align 4
   %cc3 = icmp eq i32 %tmp, 2
   br i1 %cc3, label %bb.inner.then2, label %bb.outer.end
 
 bb.inner.then2:
-  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 4
-  store i32 4, i32 addrspace(1)* %tmp5, align 4
+  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %tmp1, i32 4
+  store i32 4, ptr addrspace(1) %tmp5, align 4
   br label %bb.outer.end
 
 bb.outer.end:
-  store i32 3, i32 addrspace(3)* null
+  store i32 3, ptr addrspace(3) null
   ret void
 }
 
@@ -375,15 +375,15 @@ bb.outer.end:
 ; GCN-O0:      s_barrier
 ; GCN-O0:      s_endpgm
 ;
-define amdgpu_kernel void @s_endpgm_unsafe_barrier(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = icmp ugt i32 %tmp, 1
   br i1 %tmp1, label %bb.then, label %bb.end
 
 bb.then:                                          ; preds = %bb
-  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
-  store i32 0, i32 addrspace(1)* %tmp4, align 4
+  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp
+  store i32 0, ptr addrspace(1) %tmp4, align 4
   br label %bb.end
 
 bb.end:                                           ; preds = %bb.then, %bb
@@ -520,7 +520,7 @@ bb2:                                              ; preds = %bb1
   br i1 %tmp3, label %bb4, label %bb10
 
 bb4:                                              ; preds = %bb2
-  %tmp6 = load float, float addrspace(5)* undef
+  %tmp6 = load float, ptr addrspace(5) undef
   %tmp7 = fcmp olt float %tmp6, 0.0
   br i1 %tmp7, label %bb8, label %Flow
 
@@ -540,7 +540,7 @@ Flow1:                                            ; preds = %bb10
   br label %bb1
 
 bb12:                                             ; preds = %bb10
-  store volatile <4 x float> %tmp11, <4 x float> addrspace(5)* undef, align 16
+  store volatile <4 x float> %tmp11, ptr addrspace(5) undef, align 16
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll b/llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll
index 5ea9b8318ff46..40cb09e5fd5cb 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll
@@ -3,22 +3,22 @@
 ; GCN-LABEL: {{^}}combine_ftrunc_frint_f64:
 ; GCN: v_rndne_f64_e32 [[RND:v\[[0-9:]+\]]],
 ; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[RND]]
-define amdgpu_kernel void @combine_ftrunc_frint_f64(double addrspace(1)* %p) {
-  %v = load double, double addrspace(1)* %p, align 8
+define amdgpu_kernel void @combine_ftrunc_frint_f64(ptr addrspace(1) %p) {
+  %v = load double, ptr addrspace(1) %p, align 8
   %round = tail call double @llvm.rint.f64(double %v)
   %trunc = tail call double @llvm.trunc.f64(double %round)
-  store double %trunc, double addrspace(1)* %p, align 8
+  store double %trunc, ptr addrspace(1) %p, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}combine_ftrunc_frint_f32:
 ; GCN: v_rndne_f32_e32 [[RND:v[0-9]+]],
 ; GCN: flat_store_dword v[{{[0-9:]+}}], [[RND]]
-define amdgpu_kernel void @combine_ftrunc_frint_f32(float addrspace(1)* %p) {
-  %v = load float, float addrspace(1)* %p, align 4
+define amdgpu_kernel void @combine_ftrunc_frint_f32(ptr addrspace(1) %p) {
+  %v = load float, ptr addrspace(1) %p, align 4
   %round = tail call float @llvm.rint.f32(float %v)
   %trunc = tail call float @llvm.trunc.f32(float %round)
-  store float %trunc, float addrspace(1)* %p, align 4
+  store float %trunc, ptr addrspace(1) %p, align 4
   ret void
 }
 
@@ -28,44 +28,44 @@ define amdgpu_kernel void @combine_ftrunc_frint_f32(float addrspace(1)* %p) {
 ; GCN-DAG: v_rndne_f32_e32 v[[RND1:[0-9]+]], s[[SRC1]]
 ; GCN-DAG: v_rndne_f32_e32 v[[RND2:[0-9]+]], s[[SRC2]]
 ; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v[[[RND1]]:[[RND2]]]
-define amdgpu_kernel void @combine_ftrunc_frint_v2f32(<2 x float> addrspace(1)* %p) {
-  %v = load <2 x float>, <2 x float> addrspace(1)* %p, align 8
+define amdgpu_kernel void @combine_ftrunc_frint_v2f32(ptr addrspace(1) %p) {
+  %v = load <2 x float>, ptr addrspace(1) %p, align 8
   %round = tail call <2 x float> @llvm.rint.v2f32(<2 x float> %v)
   %trunc = tail call <2 x float> @llvm.trunc.v2f32(<2 x float> %round)
-  store <2 x float> %trunc, <2 x float> addrspace(1)* %p, align 8
+  store <2 x float> %trunc, ptr addrspace(1) %p, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}combine_ftrunc_fceil_f32:
 ; GCN: v_ceil_f32_e32 [[RND:v[0-9]+]],
 ; GCN: flat_store_dword v[{{[0-9:]+}}], [[RND]]
-define amdgpu_kernel void @combine_ftrunc_fceil_f32(float addrspace(1)* %p) {
-  %v = load float, float addrspace(1)* %p, align 4
+define amdgpu_kernel void @combine_ftrunc_fceil_f32(ptr addrspace(1) %p) {
+  %v = load float, ptr addrspace(1) %p, align 4
   %round = tail call float @llvm.ceil.f32(float %v)
   %trunc = tail call float @llvm.trunc.f32(float %round)
-  store float %trunc, float addrspace(1)* %p, align 4
+  store float %trunc, ptr addrspace(1) %p, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}combine_ftrunc_ffloor_f32:
 ; GCN: v_floor_f32_e32 [[RND:v[0-9]+]],
 ; GCN: flat_store_dword v[{{[0-9:]+}}], [[RND]]
-define amdgpu_kernel void @combine_ftrunc_ffloor_f32(float addrspace(1)* %p) {
-  %v = load float, float addrspace(1)* %p, align 4
+define amdgpu_kernel void @combine_ftrunc_ffloor_f32(ptr addrspace(1) %p) {
+  %v = load float, ptr addrspace(1) %p, align 4
   %round = tail call float @llvm.floor.f32(float %v)
   %trunc = tail call float @llvm.trunc.f32(float %round)
-  store float %trunc, float addrspace(1)* %p, align 4
+  store float %trunc, ptr addrspace(1) %p, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}combine_ftrunc_fnearbyint_f32:
 ; GCN: v_rndne_f32_e32 [[RND:v[0-9]+]],
 ; GCN: flat_store_dword v[{{[0-9:]+}}], [[RND]]
-define amdgpu_kernel void @combine_ftrunc_fnearbyint_f32(float addrspace(1)* %p) {
-  %v = load float, float addrspace(1)* %p, align 4
+define amdgpu_kernel void @combine_ftrunc_fnearbyint_f32(ptr addrspace(1) %p) {
+  %v = load float, ptr addrspace(1) %p, align 4
   %round = tail call float @llvm.nearbyint.f32(float %v)
   %trunc = tail call float @llvm.trunc.f32(float %round)
-  store float %trunc, float addrspace(1)* %p, align 4
+  store float %trunc, ptr addrspace(1) %p, align 4
   ret void
 }
 
@@ -73,11 +73,11 @@ define amdgpu_kernel void @combine_ftrunc_fnearbyint_f32(float addrspace(1)* %p)
 ; GCN: s_load_dword [[SRC:s[0-9]+]],
 ; GCN: v_trunc_f32_e32 [[RND:v[0-9]+]], [[SRC]]
 ; GCN: flat_store_dword v[{{[0-9:]+}}], [[RND]]
-define amdgpu_kernel void @combine_ftrunc_ftrunc_f32(float addrspace(1)* %p) {
-  %v = load float, float addrspace(1)* %p, align 4
+define amdgpu_kernel void @combine_ftrunc_ftrunc_f32(ptr addrspace(1) %p) {
+  %v = load float, ptr addrspace(1) %p, align 4
   %round = tail call float @llvm.trunc.f32(float %v)
   %trunc = tail call float @llvm.trunc.f32(float %round)
-  store float %trunc, float addrspace(1)* %p, align 4
+  store float %trunc, ptr addrspace(1) %p, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
index d5958e5b3d74c..c703a1dd7734d 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
@@ -8,27 +8,27 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 ; GCN-LABEL: {{^}}commute_eq_64_i32:
 ; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}}
-define amdgpu_kernel void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_eq_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in
   %cmp = icmp eq i32 %val, 64
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ne_64_i32:
 ; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}}
-define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in
   %cmp = icmp ne i32 %val, 64
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -36,132 +36,132 @@ define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspa
 ; GCN-LABEL: {{^}}commute_ne_litk_i32:
 ; GCN: s_movk_i32 [[K:s[0-9]+]], 0x3039
 ; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}}
-define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_litk_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in
   %cmp = icmp ne i32 %val, 12345
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ugt_64_i32:
 ; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
-define amdgpu_kernel void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in
   %cmp = icmp ugt i32 %val, 64
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_uge_64_i32:
 ; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
-define amdgpu_kernel void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in
   %cmp = icmp uge i32 %val, 64
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ult_64_i32:
 ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
-define amdgpu_kernel void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in
   %cmp = icmp ult i32 %val, 64
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ule_63_i32:
 ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
-define amdgpu_kernel void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_63_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in
   %cmp = icmp ule i32 %val, 63
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ule_64_i32:
 ; GCN: s_movk_i32 [[K:s[0-9]+]], 0x41{{$}}
 ; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
-define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in
   %cmp = icmp ule i32 %val, 64
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
 ; GCN: v_ashrrev_i32_e32 v2, 31, v2
-define amdgpu_kernel void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sgt_neg1_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in
   %cmp = icmp sgt i32 %val, -1
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_sge_neg2_i32:
 ; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
-define amdgpu_kernel void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sge_neg2_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in
   %cmp = icmp sge i32 %val, -2
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_slt_neg16_i32:
 ; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
-define amdgpu_kernel void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_slt_neg16_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in
   %cmp = icmp slt i32 %val, -16
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_sle_5_i32:
 ; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
-define amdgpu_kernel void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sle_5_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in
   %cmp = icmp sle i32 %val, 5
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -171,79 +171,79 @@ define amdgpu_kernel void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspa
 
 ; GCN-LABEL: {{^}}commute_eq_64_i64:
 ; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_eq_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i64, ptr addrspace(1) %gep.in
   %cmp = icmp eq i64 %val, 64
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ne_64_i64:
 ; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i64, ptr addrspace(1) %gep.in
   %cmp = icmp ne i64 %val, 64
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ugt_64_i64:
 ; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i64, ptr addrspace(1) %gep.in
   %cmp = icmp ugt i64 %val, 64
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_uge_64_i64:
 ; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i64, ptr addrspace(1) %gep.in
   %cmp = icmp uge i64 %val, 64
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ult_64_i64:
 ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i64, ptr addrspace(1) %gep.in
   %cmp = icmp ult i64 %val, 64
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ule_63_i64:
 ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i64, ptr addrspace(1) %gep.in
   %cmp = icmp ule i64 %val, 63
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -252,66 +252,66 @@ define amdgpu_kernel void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrsp
 ; GCN-LABEL: {{^}}commute_ule_64_i64:
 ; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
 ; GCN: v_cmp_gt_u64_e32 vcc, s[[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i64, ptr addrspace(1) %gep.in
   %cmp = icmp ule i64 %val, 64
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
 ; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i64, ptr addrspace(1) %gep.in
   %cmp = icmp sgt i64 %val, -1
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_sge_neg2_i64:
 ; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sge_neg2_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i64, ptr addrspace(1) %gep.in
   %cmp = icmp sge i64 %val, -2
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_slt_neg16_i64:
 ; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_slt_neg16_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i64, ptr addrspace(1) %gep.in
   %cmp = icmp slt i64 %val, -16
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_sle_5_i64:
 ; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sle_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load i64, ptr addrspace(1) %gep.in
   %cmp = icmp sle i64 %val, 5
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -322,184 +322,184 @@ define amdgpu_kernel void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspa
 
 ; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
 ; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oeq_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp oeq float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 
 ; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
 ; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ogt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp ogt float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_oge_2.0_f32:
 ; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oge_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp oge float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_olt_2.0_f32:
 ; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_olt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp olt float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ole_2.0_f32:
 ; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ole_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp ole float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_one_2.0_f32:
 ; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_one_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp one float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ord_2.0_f32:
 ; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
-define amdgpu_kernel void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ord_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp ord float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
 ; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ueq_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp ueq float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
 ; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp ugt float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_uge_2.0_f32:
 ; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp uge float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ult_2.0_f32:
 ; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp ult float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ule_2.0_f32:
 ; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp ule float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_une_2.0_f32:
 ; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_une_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp une float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_uno_2.0_f32:
 ; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
-define amdgpu_kernel void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uno_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
+  %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load float, ptr addrspace(1) %gep.in
   %cmp = fcmp uno float %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -510,184 +510,184 @@ define amdgpu_kernel void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float add
 
 ; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
 ; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oeq_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp oeq double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 
 ; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
 ; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ogt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp ogt double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_oge_2.0_f64:
 ; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oge_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp oge double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_olt_2.0_f64:
 ; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_olt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp olt double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ole_2.0_f64:
 ; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ole_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp ole double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_one_2.0_f64:
 ; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_one_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp one double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ord_2.0_f64:
 ; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
-define amdgpu_kernel void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ord_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp ord double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
 ; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ueq_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp ueq double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
 ; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp ugt double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_uge_2.0_f64:
 ; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp uge double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ult_2.0_f64:
 ; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp ult double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_ule_2.0_f64:
 ; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp ule double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_une_2.0_f64:
 ; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_une_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp une double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_uno_2.0_f64:
 ; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
-define amdgpu_kernel void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
+  %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %val = load double, ptr addrspace(1) %gep.in
   %cmp = fcmp uno double %val, 2.0
   %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
+  store i32 %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -701,13 +701,13 @@ define amdgpu_kernel void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double ad
 
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, [[FI]]
-define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @commute_frameindex(ptr addrspace(1) nocapture %out) #0 {
 entry:
   %stack0 = alloca i32, addrspace(5)
-  %ptr0 = load volatile i32 addrspace(5)*, i32 addrspace(5)* addrspace(1)* undef
-  %eq = icmp eq i32 addrspace(5)* %ptr0, %stack0
+  %ptr0 = load volatile ptr addrspace(5), ptr addrspace(1) undef
+  %eq = icmp eq ptr addrspace(5) %ptr0, %stack0
   %ext = zext i1 %eq to i32
-  store volatile i32 %ext, i32 addrspace(1)* %out
+  store volatile i32 %ext, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/concat_vectors.ll b/llvm/test/CodeGen/AMDGPU/concat_vectors.ll
index 61c10982d5f28..adee2b7990921 100644
--- a/llvm/test/CodeGen/AMDGPU/concat_vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/concat_vectors.ll
@@ -8,302 +8,302 @@
 ; value if we want to ensure scratch memory is not being used.
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) nounwind {
   %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
-  store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8
+  store <2 x i32> %concat, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v2i32:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) nounwind {
   %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16
+  store <4 x i32> %concat, ptr addrspace(1) %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v4i32:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) nounwind {
   %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32
+  store <8 x i32> %concat, ptr addrspace(1) %out, align 32
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v8i32:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) nounwind {
   %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64
+  store <16 x i32> %concat, ptr addrspace(1) %out, align 64
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v16i32:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <16 x i32> %b) nounwind {
   %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128
+  store <32 x i32> %concat, ptr addrspace(1) %out, align 128
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v1f32:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1f32(ptr addrspace(1) %out, <1 x float> %a, <1 x float> %b) nounwind {
   %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> <i32 0, i32 1>
-  store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8
+  store <2 x float> %concat, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v2f32:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) nounwind {
   %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16
+  store <4 x float> %concat, ptr addrspace(1) %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v4f32:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4f32(ptr addrspace(1) %out, <4 x float> %a, <4 x float> %b) nounwind {
   %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32
+  store <8 x float> %concat, ptr addrspace(1) %out, align 32
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v8f32:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8f32(ptr addrspace(1) %out, <8 x float> %a, <8 x float> %b) nounwind {
   %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64
+  store <16 x float> %concat, ptr addrspace(1) %out, align 64
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v16f32:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16f32(ptr addrspace(1) %out, <16 x float> %a, <16 x float> %b) nounwind {
   %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128
+  store <32 x float> %concat, ptr addrspace(1) %out, align 128
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v1i64:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i64(ptr addrspace(1) %out, <1 x double> %a, <1 x double> %b) nounwind {
   %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
-  store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
+  store <2 x double> %concat, ptr addrspace(1) %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v2i64:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i64(ptr addrspace(1) %out, <2 x double> %a, <2 x double> %b) nounwind {
   %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
+  store <4 x double> %concat, ptr addrspace(1) %out, align 32
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v4i64:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i64(ptr addrspace(1) %out, <4 x double> %a, <4 x double> %b) nounwind {
   %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
+  store <8 x double> %concat, ptr addrspace(1) %out, align 64
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v8i64:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i64(ptr addrspace(1) %out, <8 x double> %a, <8 x double> %b) nounwind {
   %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
+  store <16 x double> %concat, ptr addrspace(1) %out, align 128
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v16i64:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i64(ptr addrspace(1) %out, <16 x double> %a, <16 x double> %b) nounwind {
   %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
+  store <32 x double> %concat, ptr addrspace(1) %out, align 256
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v1f64:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1f64(ptr addrspace(1) %out, <1 x double> %a, <1 x double> %b) nounwind {
   %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
-  store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
+  store <2 x double> %concat, ptr addrspace(1) %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v2f64:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2f64(ptr addrspace(1) %out, <2 x double> %a, <2 x double> %b) nounwind {
   %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
+  store <4 x double> %concat, ptr addrspace(1) %out, align 32
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v4f64:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4f64(ptr addrspace(1) %out, <4 x double> %a, <4 x double> %b) nounwind {
   %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
+  store <8 x double> %concat, ptr addrspace(1) %out, align 64
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v8f64:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8f64(ptr addrspace(1) %out, <8 x double> %a, <8 x double> %b) nounwind {
   %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
+  store <16 x double> %concat, ptr addrspace(1) %out, align 128
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v16f64:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16f64(ptr addrspace(1) %out, <16 x double> %a, <16 x double> %b) nounwind {
   %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
+  store <32 x double> %concat, ptr addrspace(1) %out, align 256
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v1i1:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i1(ptr addrspace(1) %out, <1 x i1> %a, <1 x i1> %b) nounwind {
   %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1>
-  store <2 x i1> %concat, <2 x i1> addrspace(1)* %out
+  store <2 x i1> %concat, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v2i1:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i1(ptr addrspace(1) %out, <2 x i1> %a, <2 x i1> %b) nounwind {
   %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i1> %concat, <4 x i1> addrspace(1)* %out
+  store <4 x i1> %concat, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v4i1:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i1(ptr addrspace(1) %out, <4 x i1> %a, <4 x i1> %b) nounwind {
   %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i1> %concat, <8 x i1> addrspace(1)* %out
+  store <8 x i1> %concat, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v8i1:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i1(ptr addrspace(1) %out, <8 x i1> %a, <8 x i1> %b) nounwind {
   %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  store <16 x i1> %concat, <16 x i1> addrspace(1)* %out
+  store <16 x i1> %concat, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v16i1:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i1(ptr addrspace(1) %out, <16 x i1> %a, <16 x i1> %b) nounwind {
   %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  store <32 x i1> %concat, <32 x i1> addrspace(1)* %out
+  store <32 x i1> %concat, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v32i1:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v32i1(ptr addrspace(1) %out, <32 x i1> %a, <32 x i1> %b) nounwind {
   %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-  store <64 x i1> %concat, <64 x i1> addrspace(1)* %out
+  store <64 x i1> %concat, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v1i16:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i16(ptr addrspace(1) %out, <1 x i16> %a, <1 x i16> %b) nounwind {
   %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
-  store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %concat, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v2i16:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) nounwind {
   %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8
+  store <4 x i16> %concat, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v4i16:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i16(ptr addrspace(1) %out, <4 x i16> %a, <4 x i16> %b) nounwind {
   %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16
+  store <8 x i16> %concat, ptr addrspace(1) %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v8i16:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i16(ptr addrspace(1) %out, <8 x i16> %a, <8 x i16> %b) nounwind {
   %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32
+  store <16 x i16> %concat, ptr addrspace(1) %out, align 32
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_concat_v16i16:
 ; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; GCN-NOT: movrel
-define amdgpu_kernel void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i16(ptr addrspace(1) %out, <16 x i16> %a, <16 x i16> %b) nounwind {
   %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64
+  store <32 x i16> %concat, ptr addrspace(1) %out, align 64
   ret void
 }
 
 ; GCN-LABEL: {{^}}concat_vector_crash:
 ; GCN: s_endpgm
-define amdgpu_kernel void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @concat_vector_crash(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 bb:
-  %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
+  %tmp = load <2 x float>, ptr addrspace(1) %in, align 4
   %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %tmp2 = shufflevector <8 x float> undef, <8 x float> %tmp1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-  store <8 x float> %tmp2, <8 x float> addrspace(1)* %out, align 32
+  store <8 x float> %tmp2, ptr addrspace(1) %out, align 32
   ret void
 }
 
 ; GCN-LABEL: {{^}}concat_vector_crash2:
 ; GCN: s_endpgm
-define amdgpu_kernel void @concat_vector_crash2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %tmp = load i32, i32 addrspace(1)* %in, align 1
+define amdgpu_kernel void @concat_vector_crash2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %tmp = load i32, ptr addrspace(1) %in, align 1
   %tmp1 = trunc i32 %tmp to i24
   %tmp2 = bitcast i24 %tmp1 to <3 x i8>
   %tmp3 = shufflevector <3 x i8> %tmp2, <3 x i8> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef>
   %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 7, i8 8>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-  store <8 x i8> %tmp4, <8 x i8> addrspace(1)* %out, align 8
+  store <8 x i8> %tmp4, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -313,7 +313,7 @@ define amdgpu_kernel void @concat_vector_crash2(<8 x i8> addrspace(1)* %out, i32
 ; VI: ds_write_b128
 define amdgpu_kernel void @build_vector_splat_concat_v8i16() {
 entry:
-  store <8 x i16> zeroinitializer, <8 x i16> addrspace(3)* undef, align 16
-  store <8 x i16> zeroinitializer, <8 x i16> addrspace(3)* null, align 16
+  store <8 x i16> zeroinitializer, ptr addrspace(3) undef, align 16
+  store <8 x i16> zeroinitializer, ptr addrspace(3) null, align 16
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
index 6a12c0e70c186..863d0a834e5f9 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
@@ -11,10 +11,10 @@
 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2
 ; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
 ; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
-define amdgpu_vs float @load_i32(i32 addrspace(6)* inreg %p0, i32 addrspace(6)* inreg %p1) #0 {
-  %gep1 = getelementptr inbounds i32, i32 addrspace(6)* %p1, i32 2
-  %r0 = load i32, i32 addrspace(6)* %p0
-  %r1 = load i32, i32 addrspace(6)* %gep1
+define amdgpu_vs float @load_i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+  %gep1 = getelementptr inbounds i32, ptr addrspace(6) %p1, i32 2
+  %r0 = load i32, ptr addrspace(6) %p0
+  %r1 = load i32, ptr addrspace(6) %gep1
   %r = add i32 %r0, %r1
   %r2 = bitcast i32 %r to float
   ret float %r2
@@ -33,10 +33,10 @@ define amdgpu_vs float @load_i32(i32 addrspace(6)* inreg %p0, i32 addrspace(6)*
 ; GFX9-DAG: s_mov_b32 s1, s3
 ; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
 ; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
-define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x i32> addrspace(6)* inreg %p1) #0 {
-  %gep1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(6)* %p1, i32 2
-  %r0 = load <2 x i32>, <2 x i32> addrspace(6)* %p0
-  %r1 = load <2 x i32>, <2 x i32> addrspace(6)* %gep1
+define amdgpu_vs <2 x float> @load_v2i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+  %gep1 = getelementptr inbounds <2 x i32>, ptr addrspace(6) %p1, i32 2
+  %r0 = load <2 x i32>, ptr addrspace(6) %p0
+  %r1 = load <2 x i32>, ptr addrspace(6) %gep1
   %r = add <2 x i32> %r0, %r1
   %r2 = bitcast <2 x i32> %r to <2 x float>
   ret <2 x float> %r2
@@ -52,10 +52,10 @@ define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x
 ; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
 ; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
 ; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
-define amdgpu_vs <4 x float> @load_v4i32(<4 x i32> addrspace(6)* inreg %p0, <4 x i32> addrspace(6)* inreg %p1) #0 {
-  %gep1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(6)* %p1, i32 2
-  %r0 = load <4 x i32>, <4 x i32> addrspace(6)* %p0
-  %r1 = load <4 x i32>, <4 x i32> addrspace(6)* %gep1
+define amdgpu_vs <4 x float> @load_v4i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+  %gep1 = getelementptr inbounds <4 x i32>, ptr addrspace(6) %p1, i32 2
+  %r0 = load <4 x i32>, ptr addrspace(6) %p0
+  %r1 = load <4 x i32>, ptr addrspace(6) %gep1
   %r = add <4 x i32> %r0, %r1
   %r2 = bitcast <4 x i32> %r to <4 x float>
   ret <4 x float> %r2
@@ -71,10 +71,10 @@ define amdgpu_vs <4 x float> @load_v4i32(<4 x i32> addrspace(6)* inreg %p0, <4 x
 ; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
 ; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
 ; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
-define amdgpu_vs <8 x float> @load_v8i32(<8 x i32> addrspace(6)* inreg %p0, <8 x i32> addrspace(6)* inreg %p1) #0 {
-  %gep1 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(6)* %p1, i32 2
-  %r0 = load <8 x i32>, <8 x i32> addrspace(6)* %p0
-  %r1 = load <8 x i32>, <8 x i32> addrspace(6)* %gep1
+define amdgpu_vs <8 x float> @load_v8i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+  %gep1 = getelementptr inbounds <8 x i32>, ptr addrspace(6) %p1, i32 2
+  %r0 = load <8 x i32>, ptr addrspace(6) %p0
+  %r1 = load <8 x i32>, ptr addrspace(6) %gep1
   %r = add <8 x i32> %r0, %r1
   %r2 = bitcast <8 x i32> %r to <8 x float>
   ret <8 x float> %r2
@@ -90,10 +90,10 @@ define amdgpu_vs <8 x float> @load_v8i32(<8 x i32> addrspace(6)* inreg %p0, <8 x
 ; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
 ; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
 ; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
-define amdgpu_vs <16 x float> @load_v16i32(<16 x i32> addrspace(6)* inreg %p0, <16 x i32> addrspace(6)* inreg %p1) #0 {
-  %gep1 = getelementptr inbounds <16 x i32>, <16 x i32> addrspace(6)* %p1, i32 2
-  %r0 = load <16 x i32>, <16 x i32> addrspace(6)* %p0
-  %r1 = load <16 x i32>, <16 x i32> addrspace(6)* %gep1
+define amdgpu_vs <16 x float> @load_v16i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+  %gep1 = getelementptr inbounds <16 x i32>, ptr addrspace(6) %p1, i32 2
+  %r0 = load <16 x i32>, ptr addrspace(6) %p0
+  %r1 = load <16 x i32>, ptr addrspace(6) %gep1
   %r = add <16 x i32> %r0, %r1
   %r2 = bitcast <16 x i32> %r to <16 x float>
   ret <16 x float> %r2
@@ -109,10 +109,10 @@ define amdgpu_vs <16 x float> @load_v16i32(<16 x i32> addrspace(6)* inreg %p0, <
 ; VI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
 ; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
 ; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
-define amdgpu_vs float @load_float(float addrspace(6)* inreg %p0, float addrspace(6)* inreg %p1) #0 {
-  %gep1 = getelementptr inbounds float, float addrspace(6)* %p1, i32 2
-  %r0 = load float, float addrspace(6)* %p0
-  %r1 = load float, float addrspace(6)* %gep1
+define amdgpu_vs float @load_float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+  %gep1 = getelementptr inbounds float, ptr addrspace(6) %p1, i32 2
+  %r0 = load float, ptr addrspace(6) %p0
+  %r1 = load float, ptr addrspace(6) %gep1
   %r = fadd float %r0, %r1
   ret float %r
 }
@@ -130,10 +130,10 @@ define amdgpu_vs float @load_float(float addrspace(6)* inreg %p0, float addrspac
 ; GFX9-DAG: s_mov_b32 s1, s3
 ; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
 ; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
-define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, <2 x float> addrspace(6)* inreg %p1) #0 {
-  %gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(6)* %p1, i32 2
-  %r0 = load <2 x float>, <2 x float> addrspace(6)* %p0
-  %r1 = load <2 x float>, <2 x float> addrspace(6)* %gep1
+define amdgpu_vs <2 x float> @load_v2float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+  %gep1 = getelementptr inbounds <2 x float>, ptr addrspace(6) %p1, i32 2
+  %r0 = load <2 x float>, ptr addrspace(6) %p0
+  %r1 = load <2 x float>, ptr addrspace(6) %gep1
   %r = fadd <2 x float> %r0, %r1
   ret <2 x float> %r
 }
@@ -148,10 +148,10 @@ define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0,
 ; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
 ; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
 ; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
-define amdgpu_vs <4 x float> @load_v4float(<4 x float> addrspace(6)* inreg %p0, <4 x float> addrspace(6)* inreg %p1) #0 {
-  %gep1 = getelementptr inbounds <4 x float>, <4 x float> addrspace(6)* %p1, i32 2
-  %r0 = load <4 x float>, <4 x float> addrspace(6)* %p0
-  %r1 = load <4 x float>, <4 x float> addrspace(6)* %gep1
+define amdgpu_vs <4 x float> @load_v4float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+  %gep1 = getelementptr inbounds <4 x float>, ptr addrspace(6) %p1, i32 2
+  %r0 = load <4 x float>, ptr addrspace(6) %p0
+  %r1 = load <4 x float>, ptr addrspace(6) %gep1
   %r = fadd <4 x float> %r0, %r1
   ret <4 x float> %r
 }
@@ -166,10 +166,10 @@ define amdgpu_vs <4 x float> @load_v4float(<4 x float> addrspace(6)* inreg %p0,
 ; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
 ; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
 ; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
-define amdgpu_vs <8 x float> @load_v8float(<8 x float> addrspace(6)* inreg %p0, <8 x float> addrspace(6)* inreg %p1) #0 {
-  %gep1 = getelementptr inbounds <8 x float>, <8 x float> addrspace(6)* %p1, i32 2
-  %r0 = load <8 x float>, <8 x float> addrspace(6)* %p0
-  %r1 = load <8 x float>, <8 x float> addrspace(6)* %gep1
+define amdgpu_vs <8 x float> @load_v8float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+  %gep1 = getelementptr inbounds <8 x float>, ptr addrspace(6) %p1, i32 2
+  %r0 = load <8 x float>, ptr addrspace(6) %p0
+  %r1 = load <8 x float>, ptr addrspace(6) %gep1
   %r = fadd <8 x float> %r0, %r1
   ret <8 x float> %r
 }
@@ -184,10 +184,10 @@ define amdgpu_vs <8 x float> @load_v8float(<8 x float> addrspace(6)* inreg %p0,
 ; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
 ; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
 ; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
-define amdgpu_vs <16 x float> @load_v16float(<16 x float> addrspace(6)* inreg %p0, <16 x float> addrspace(6)* inreg %p1) #0 {
-  %gep1 = getelementptr inbounds <16 x float>, <16 x float> addrspace(6)* %p1, i32 2
-  %r0 = load <16 x float>, <16 x float> addrspace(6)* %p0
-  %r1 = load <16 x float>, <16 x float> addrspace(6)* %gep1
+define amdgpu_vs <16 x float> @load_v16float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
+  %gep1 = getelementptr inbounds <16 x float>, ptr addrspace(6) %p1, i32 2
+  %r0 = load <16 x float>, ptr addrspace(6) %p0
+  %r1 = load <16 x float>, ptr addrspace(6) %gep1
   %r = fadd <16 x float> %r0, %r1
   ret <16 x float> %r
 }
@@ -195,32 +195,32 @@ define amdgpu_vs <16 x float> @load_v16float(<16 x float> addrspace(6)* inreg %p
 ; GCN-LABEL: {{^}}load_i32_hi0:
 ; GCN: s_mov_b32 s1, 0
 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
-define amdgpu_vs i32 @load_i32_hi0(i32 addrspace(6)* inreg %p) #1 {
-  %r0 = load i32, i32 addrspace(6)* %p
+define amdgpu_vs i32 @load_i32_hi0(ptr addrspace(6) inreg %p) #1 {
+  %r0 = load i32, ptr addrspace(6) %p
   ret i32 %r0
 }
 
 ; GCN-LABEL: {{^}}load_i32_hi1:
 ; GCN: s_mov_b32 s1, 1
 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
-define amdgpu_vs i32 @load_i32_hi1(i32 addrspace(6)* inreg %p) #2 {
-  %r0 = load i32, i32 addrspace(6)* %p
+define amdgpu_vs i32 @load_i32_hi1(ptr addrspace(6) inreg %p) #2 {
+  %r0 = load i32, ptr addrspace(6) %p
   ret i32 %r0
 }
 
 ; GCN-LABEL: {{^}}load_i32_hiffff8000:
 ; GCN: s_movk_i32 s1, 0x8000
 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
-define amdgpu_vs i32 @load_i32_hiffff8000(i32 addrspace(6)* inreg %p) #3 {
-  %r0 = load i32, i32 addrspace(6)* %p
+define amdgpu_vs i32 @load_i32_hiffff8000(ptr addrspace(6) inreg %p) #3 {
+  %r0 = load i32, ptr addrspace(6) %p
   ret i32 %r0
 }
 
 ; GCN-LABEL: {{^}}load_i32_hifffffff0:
 ; GCN: s_mov_b32 s1, -16
 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
-define amdgpu_vs i32 @load_i32_hifffffff0(i32 addrspace(6)* inreg %p) #4 {
-  %r0 = load i32, i32 addrspace(6)* %p
+define amdgpu_vs i32 @load_i32_hifffffff0(ptr addrspace(6) inreg %p) #4 {
+  %r0 = load i32, ptr addrspace(6) %p
   ret i32 %r0
 }
 
@@ -230,31 +230,30 @@ define amdgpu_vs i32 @load_i32_hifffffff0(i32 addrspace(6)* inreg %p) #4 {
 ; GCN: s_load_dwordx8
 ; GCN-NEXT: s_load_dwordx4
 ; GCN: image_sample
-define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler([0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
+define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler(ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
 main_body:
   %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
   %23 = bitcast float %22 to i32
   %24 = shl i32 %23, 1
-  %25 = getelementptr inbounds [0 x <8 x i32>], [0 x <8 x i32>] addrspace(6)* %1, i32 0, i32 %24, !amdgpu.uniform !0
-  %26 = load <8 x i32>, <8 x i32> addrspace(6)* %25, align 32, !invariant.load !0
+  %25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24, !amdgpu.uniform !0
+  %26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0
   %27 = shl i32 %23, 2
   %28 = or i32 %27, 3
-  %29 = bitcast [0 x <8 x i32>] addrspace(6)* %1 to [0 x <4 x i32>] addrspace(6)*
-  %30 = getelementptr inbounds [0 x <4 x i32>], [0 x <4 x i32>] addrspace(6)* %29, i32 0, i32 %28, !amdgpu.uniform !0
-  %31 = load <4 x i32>, <4 x i32> addrspace(6)* %30, align 16, !invariant.load !0
-  %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %31, i1 0, i32 0, i32 0) #8
-  %33 = extractelement <4 x float> %32, i32 0
-  %34 = extractelement <4 x float> %32, i32 1
-  %35 = extractelement <4 x float> %32, i32 2
-  %36 = extractelement <4 x float> %32, i32 3
-  %37 = bitcast float %4 to i32
-  %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %37, 4
-  %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 5
-  %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 6
-  %41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 7
-  %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %36, 8
-  %43 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42, float %20, 19
-  ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %43
+  %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28, !amdgpu.uniform !0
+  %30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0
+  %31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8
+  %32 = extractelement <4 x float> %31, i32 0
+  %33 = extractelement <4 x float> %31, i32 1
+  %34 = extractelement <4 x float> %31, i32 2
+  %35 = extractelement <4 x float> %31, i32 3
+  %36 = bitcast float %4 to i32
+  %37 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %36, 4
+  %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %37, float %32, 5
+  %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 6
+  %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 7
+  %41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 8
+  %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %20, 19
+  ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42
 }
 
 ; GCN-LABEL: {{^}}load_sampler_nouniform
@@ -263,40 +262,39 @@ main_body:
 ; GCN: s_load_dwordx8
 ; GCN-NEXT: s_load_dwordx4
 ; GCN: image_sample
-define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler_nouniform([0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
+define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler_nouniform(ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
 main_body:
   %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
   %23 = bitcast float %22 to i32
   %24 = shl i32 %23, 1
-  %25 = getelementptr inbounds [0 x <8 x i32>], [0 x <8 x i32>] addrspace(6)* %1, i32 0, i32 %24
-  %26 = load <8 x i32>, <8 x i32> addrspace(6)* %25, align 32, !invariant.load !0
+  %25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24
+  %26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0
   %27 = shl i32 %23, 2
   %28 = or i32 %27, 3
-  %29 = bitcast [0 x <8 x i32>] addrspace(6)* %1 to [0 x <4 x i32>] addrspace(6)*
-  %30 = getelementptr inbounds [0 x <4 x i32>], [0 x <4 x i32>] addrspace(6)* %29, i32 0, i32 %28
-  %31 = load <4 x i32>, <4 x i32> addrspace(6)* %30, align 16, !invariant.load !0
-  %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %31, i1 0, i32 0, i32 0) #8
-  %33 = extractelement <4 x float> %32, i32 0
-  %34 = extractelement <4 x float> %32, i32 1
-  %35 = extractelement <4 x float> %32, i32 2
-  %36 = extractelement <4 x float> %32, i32 3
-  %37 = bitcast float %4 to i32
-  %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %37, 4
-  %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 5
-  %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 6
-  %41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 7
-  %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %36, 8
-  %43 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42, float %20, 19
-  ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %43
+  %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28
+  %30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0
+  %31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8
+  %32 = extractelement <4 x float> %31, i32 0
+  %33 = extractelement <4 x float> %31, i32 1
+  %34 = extractelement <4 x float> %31, i32 2
+  %35 = extractelement <4 x float> %31, i32 3
+  %36 = bitcast float %4 to i32
+  %37 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %36, 4
+  %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %37, float %32, 5
+  %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 6
+  %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 7
+  %41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 8
+  %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %20, 19
+  ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42
 }
 
 ; GCN-LABEL: {{^}}load_addr_no_fold:
 ; GCN-DAG: s_add_i32 s0, s0, 4
 ; GCN-DAG: s_mov_b32 s1, 0
 ; GCN: s_load_dword s{{[0-9]}}, s[0:1], 0x0
-define amdgpu_vs float @load_addr_no_fold(i32 addrspace(6)* inreg noalias %p0) #0 {
-  %gep1 = getelementptr i32, i32 addrspace(6)* %p0, i32 1
-  %r1 = load i32, i32 addrspace(6)* %gep1
+define amdgpu_vs float @load_addr_no_fold(ptr addrspace(6) inreg noalias %p0) #0 {
+  %gep1 = getelementptr i32, ptr addrspace(6) %p0, i32 1
+  %r1 = load i32, ptr addrspace(6) %gep1
   %r2 = bitcast i32 %r1 to float
   ret float %r2
 }
@@ -305,9 +303,9 @@ define amdgpu_vs float @load_addr_no_fold(i32 addrspace(6)* inreg noalias %p0) #
 ; GCN: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0
 ; GCN: s_mov_b32 s[[ZERO:[0-9]+]]
 ; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[[[READLANE]]:[[ZERO]]]
-define amdgpu_vs float @vgpr_arg_src(<4 x i32> addrspace(6)* %arg) {
+define amdgpu_vs float @vgpr_arg_src(ptr addrspace(6) %arg) {
 main_body:
-  %tmp9 = load <4 x i32>, <4 x i32> addrspace(6)* %arg
+  %tmp9 = load <4 x i32>, ptr addrspace(6) %arg
   %tmp10 = call nsz float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp9, i32 undef, i32 0, i32 0, i32 0) #1
   ret float %tmp10
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
index 0ca0c6896fff1..236dee7c3b825 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -5,11 +5,11 @@
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_and_0(ptr addrspace(1) %out) {
   %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %and = and i32 %size, %x
-  store i32 %and, i32 addrspace(1)* %out
+  store i32 %and, ptr addrspace(1) %out
   ret void
 }
 
@@ -17,10 +17,10 @@ define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_and_0(ptr addrspace(1) %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %and = and i32 %size, %x
-  store i32 %and, i32 addrspace(1)* %out
+  store i32 %and, ptr addrspace(1) %out
   ret void
 }
 
@@ -28,11 +28,11 @@ define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
 ; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_or_0(ptr addrspace(1) %out) {
   %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %or = or i32 %size, %x
-  store i32 %or, i32 addrspace(1)* %out
+  store i32 %or, ptr addrspace(1) %out
   ret void
 }
 
@@ -42,10 +42,10 @@ define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_or_0(ptr addrspace(1) %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %or = or i32 %size, %x
-  store i32 %or, i32 addrspace(1)* %out
+  store i32 %or, ptr addrspace(1) %out
   ret void
 }
 
@@ -53,11 +53,11 @@ define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
 ; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_xor_0(ptr addrspace(1) %out) {
   %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %xor = xor i32 %size, %x
-  store i32 %xor, i32 addrspace(1)* %out
+  store i32 %xor, ptr addrspace(1) %out
   ret void
 }
 
@@ -67,10 +67,10 @@ define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_xor_0(ptr addrspace(1) %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %xor = xor i32 %size, %x
-  store i32 %xor, i32 addrspace(1)* %out
+  store i32 %xor, ptr addrspace(1) %out
   ret void
 }
 
@@ -78,10 +78,10 @@ define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}}
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_not_0(ptr addrspace(1) %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %xor = xor i32 %size, -1
-  store i32 %xor, i32 addrspace(1)* %out
+  store i32 %xor, ptr addrspace(1) %out
   ret void
 }
 
@@ -91,11 +91,11 @@ define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
 ; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]]
 ; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}}
 ; GCN-NEXT: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]]
-define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
-  %vreg = load volatile i64, i64 addrspace(1)* undef
+define amdgpu_kernel void @fold_mi_v_not_0(ptr addrspace(1) %out) {
+  %vreg = load volatile i64, ptr addrspace(1) undef
   %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg)
   %xor = xor i64 %ctpop, -1
-  store i64 %xor, i64 addrspace(1)* %out
+  store i64 %xor, ptr addrspace(1) %out
   ret void
 }
 
@@ -110,13 +110,13 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
 ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
 ; GCN: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]]
-define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
-  %vreg0 = load volatile i64, i64 addrspace(1)* undef
-  %vreg1 = load volatile i64, i64 addrspace(1)* undef
+define amdgpu_kernel void @fold_mi_or_neg1(ptr addrspace(1) %out) {
+  %vreg0 = load volatile i64, ptr addrspace(1) undef
+  %vreg1 = load volatile i64, ptr addrspace(1) undef
   %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)
   %xor = xor i64 %ctpop, -1
   %or = or i64 %xor, %vreg1
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   ret void
 }
 
@@ -126,13 +126,13 @@ define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
 ; GCN: v_not_b32
 ; GCN: v_and_b32
 ; GCN-NOT: v_and_b32
-define amdgpu_kernel void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
-  %vreg0 = load volatile i64, i64 addrspace(1)* undef
-  %vreg1 = load volatile i64, i64 addrspace(1)* undef
+define amdgpu_kernel void @fold_mi_and_neg1(ptr addrspace(1) %out) {
+  %vreg0 = load volatile i64, ptr addrspace(1) undef
+  %vreg1 = load volatile i64, ptr addrspace(1) undef
   %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)
   %xor = xor i64 %ctpop, -1
   %and = and i64 %xor, %vreg1
-  store i64 %and, i64 addrspace(1)* %out
+  store i64 %and, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 5622f17063032..71dade0f278dc 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -63,21 +63,21 @@
 ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
-define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_if_endif(ptr addrspace(1) %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %load0 = load volatile i32, i32 addrspace(3)* undef
+  %load0 = load volatile i32, ptr addrspace(3) undef
   %cmp0 = icmp eq i32 %tid, 0
   br i1 %cmp0, label %if, label %endif
 
 if:
-  %load1 = load volatile i32, i32 addrspace(3)* undef
+  %load1 = load volatile i32, ptr addrspace(3) undef
   %val = add i32 %load0, %load1
   br label %endif
 
 endif:
   %tmp4 = phi i32 [ %val, %if ], [ 0, %entry ]
-  store i32 %tmp4, i32 addrspace(1)* %out
+  store i32 %tmp4, ptr addrspace(1) %out
   ret void
 }
 
@@ -133,17 +133,17 @@ endif:
 ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
-define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_loop(ptr addrspace(1) %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %load0 = load volatile i32, i32 addrspace(3)* null
+  %load0 = load volatile i32, ptr addrspace(3) null
   %cmp0 = icmp eq i32 %tid, 0
   br i1 %cmp0, label %loop, label %end
 
 loop:
   %i = phi i32 [ %i.inc, %loop ], [ 0, %entry ]
   %val = phi i32 [ %val.sub, %loop ], [ %load0, %entry ]
-  %load1 = load volatile i32, i32 addrspace(3)* undef
+  %load1 = load volatile i32, ptr addrspace(3) undef
   %i.inc = add i32 %i, 1
   %val.sub = sub i32 %val, %load1
   %cmp1 = icmp ne i32 %i, 256
@@ -151,7 +151,7 @@ loop:
 
 end:
   %tmp4 = phi i32 [ %val.sub, %loop ], [ 0, %entry ]
-  store i32 %tmp4, i32 addrspace(1)* %out
+  store i32 %tmp4, ptr addrspace(1) %out
   ret void
 }
 
@@ -244,26 +244,26 @@ end:
 ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
-define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_if_else_endif(ptr addrspace(1) %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %load0 = load volatile i32, i32 addrspace(3)* null
+  %load0 = load volatile i32, ptr addrspace(3) null
   %cmp0 = icmp eq i32 %tid, 0
   br i1 %cmp0, label %if, label %else
 
 if:
-  %load1 = load volatile i32, i32 addrspace(3)* undef
+  %load1 = load volatile i32, ptr addrspace(3) undef
   %val0 = add i32 %load0, %load1
   br label %endif
 
 else:
-  %load2 = load volatile i32, i32 addrspace(3)* undef
+  %load2 = load volatile i32, ptr addrspace(3) undef
   %val1 = sub i32 %load0, %load2
   br label %endif
 
 endif:
   %result = phi i32 [ %val0, %if ], [ %val1, %else ]
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
index 81a5e540da6b6..c06e56ec2eb21 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
@@ -40,7 +40,7 @@ bb9:                                              ; preds = %bb14, %bb8
   br i1 %tmp12, label %bb13, label %bb14
 
 bb13:                                             ; preds = %bb9
-  store volatile i32 0, i32 addrspace(1)* undef, align 4
+  store volatile i32 0, ptr addrspace(1) undef, align 4
   br label %bb14
 
 bb14:                                             ; preds = %bb13, %bb9

diff  --git a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
index 2ba4d0cf1d992..bd523d4ac30b9 100644
--- a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
+++ b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 ; GCN: v_cmp_ne_u32_e64
 ; GCN: s_cbranch_execz
 ; GCN: ; %bb.{{[0-9]+}}:
-define amdgpu_kernel void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @convergent_inlineasm(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) #1
@@ -14,8 +14,8 @@ bb:
   br i1 %tmp2, label %bb3, label %bb5
 
 bb3:                                              ; preds = %bb
-  %tmp4 = getelementptr i64, i64 addrspace(1)* %arg, i32 %tmp
-  store i64 %tmp1, i64 addrspace(1)* %arg, align 8
+  %tmp4 = getelementptr i64, ptr addrspace(1) %arg, i32 %tmp
+  store i64 %tmp1, ptr addrspace(1) %arg, align 8
   br label %bb5
 
 bb5:                                              ; preds = %bb3, %bb
@@ -30,7 +30,7 @@ bb5:                                              ; preds = %bb3, %bb
 
 ; GCN: BB{{[0-9]+_[0-9]+}}:
 
-define amdgpu_kernel void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @nonconvergent_inlineasm(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1)
@@ -38,8 +38,8 @@ bb:
   br i1 %tmp2, label %bb3, label %bb5
 
 bb3:                                              ; preds = %bb
-  %tmp4 = getelementptr i64, i64 addrspace(1)* %arg, i32 %tmp
-  store i64 %tmp1, i64 addrspace(1)* %arg, align 8
+  %tmp4 = getelementptr i64, ptr addrspace(1) %arg, i32 %tmp
+  store i64 %tmp1, ptr addrspace(1) %arg, align 8
   br label %bb5
 
 bb5:                                              ; preds = %bb3, %bb

diff  --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index eb0fc7e06d177..b006c6b2aff7a 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -5,7 +5,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
 
-define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: test_copy_v4i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -39,13 +39,13 @@ define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8>
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
-  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
+  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: test_copy_v4i8_x2:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -94,14 +94,14 @@ define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
-  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
+  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out1, align 4
   ret void
 }
 
-define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: test_copy_v4i8_x3:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -152,15 +152,15 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
-  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
+  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out1, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out2, align 4
   ret void
 }
 
-define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: test_copy_v4i8_x4:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x11
@@ -241,16 +241,16 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
 ; VI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
-  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4
+  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
+  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out1, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out2, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out3, align 4
   ret void
 }
 
-define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: test_copy_v4i8_extra_use:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -326,16 +326,16 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
 ; VI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
-  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
+  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
+  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
   %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
-  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
+  store <4 x i8> %add, ptr addrspace(1) %out1, align 4
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: test_copy_v4i8_x2_extra_use:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -413,16 +413,16 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
-  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
+  %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
+  %val = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4
   %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
-  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
+  store <4 x i8> %add, ptr addrspace(1) %out1, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out2, align 4
   ret void
 }
 
-define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: test_copy_v3i8_align4:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -460,13 +460,13 @@ define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3
 ; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
-  %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
-  store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
+  %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid.x
+  %val = load <3 x i8>, ptr addrspace(1) %gep, align 4
+  store <3 x i8> %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: test_copy_v3i8_align2:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -506,12 +506,12 @@ define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
-  %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
-  store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
+  %val = load <3 x i8>, ptr addrspace(1) %in, align 2
+  store <3 x i8> %val, ptr addrspace(1) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: test_copy_v3i8_align1:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -556,12 +556,12 @@ define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3
 ; VI-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:1
 ; VI-NEXT:    s_endpgm
-  %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
-  store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
+  %val = load <3 x i8>, ptr addrspace(1) %in, align 1
+  store <3 x i8> %val, ptr addrspace(1) %out, align 1
   ret void
 }
 
-define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: test_copy_v4i8_volatile_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -595,12 +595,12 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
-  %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+  %val = load volatile <4 x i8>, ptr addrspace(1) %in, align 4
+  store <4 x i8> %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: test_copy_v4i8_volatile_store:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -654,7 +654,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)*
 ; VI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
-  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
-  store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+  %val = load <4 x i8>, ptr addrspace(1) %in, align 4
+  store volatile <4 x i8> %val, ptr addrspace(1) %out, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
index d899ddd272996..cfe4850cf76ba 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
@@ -6,22 +6,21 @@
 
 ; Make sure this doesn't crash
 ; CHECK-LABEL: {{^}}copy_to_reg_frameindex:
-define amdgpu_kernel void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %alloca = alloca [16 x i32], addrspace(5)
   br label %loop
 
 loop:
   %inc = phi i32 [0, %entry], [%inc.i, %loop]
-  %ptr = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %inc
-  store i32 %inc, i32 addrspace(5)* %ptr
+  %ptr = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %inc
+  store i32 %inc, ptr addrspace(5) %ptr
   %inc.i = add i32 %inc, 1
   %cnd = icmp uge i32 %inc.i, 16
   br i1 %cnd, label %done, label %loop
 
 done:
-  %tmp0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
-  %tmp1 = load i32, i32 addrspace(5)* %tmp0
-  store i32 %tmp1, i32 addrspace(1)* %out
+  %tmp1 = load i32, ptr addrspace(5) %alloca
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
index 7846fcc97555e..242d2e7d07048 100644
--- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-define amdgpu_kernel void @copy_to_scc(i32 addrspace(1)* %out, i32 addrspace(1)* %in, <4 x i32> addrspace(4)* %addrSrc) {
+define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(4) %addrSrc) {
 ; GCN-LABEL: copy_to_scc:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -25,14 +25,14 @@ define amdgpu_kernel void @copy_to_scc(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; GCN-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 entry:                                             ; preds = %1009
-  %0 = load i32, i32 addrspace(1)* %in, align 4
-  %1 = load <4 x i32>, <4 x i32> addrspace(4)* %addrSrc, align 16
+  %0 = load i32, ptr addrspace(1) %in, align 4
+  %1 = load <4 x i32>, ptr addrspace(4) %addrSrc, align 16
   %2 = icmp ne i32 %0, 0
   %3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %1, i32 252, i32 0, i32 0)
   %4 = icmp ne i32 %3, 0
   %5 = xor i1 %2, %4
   %result = select i1 %5, i32 2, i32 3
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 8cad50dff20e1..28e82208f53ad 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -234,7 +234,7 @@ if.else:                                          ; preds = %entry
 
 if.end:                                           ; preds = %if.else, %if.then
   %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
-  store <3 x i16> %call6.sink, <3 x i16> addrspace(1)* undef
+  store <3 x i16> %call6.sink, ptr addrspace(1) undef
   ret void
 }
 
@@ -287,7 +287,7 @@ if.else:                                          ; preds = %entry
 
 if.end:                                           ; preds = %if.else, %if.then
   %call6.sink = phi <3 x half> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
-  store <3 x half> %call6.sink, <3 x half> addrspace(1)* undef
+  store <3 x half> %call6.sink, ptr addrspace(1) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index ee99093fcbd0f..7e09f85cdffba 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -5,7 +5,7 @@
 
 ; The result of these atomic ops should not be used as a uniform value.
 
-define protected amdgpu_kernel void @add(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: add:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -20,14 +20,14 @@ define protected amdgpu_kernel void @add(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw add i32 addrspace(1)* %p, i32 1 monotonic
+  %n32 = atomicrmw add ptr addrspace(1) %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @sub(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: sub:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -42,14 +42,14 @@ define protected amdgpu_kernel void @sub(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw sub i32 addrspace(1)* %p, i32 1 monotonic
+  %n32 = atomicrmw sub ptr addrspace(1) %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @and(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: and:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -64,14 +64,14 @@ define protected amdgpu_kernel void @and(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw and i32 addrspace(1)* %p, i32 1 monotonic
+  %n32 = atomicrmw and ptr addrspace(1) %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @or(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: or:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -86,14 +86,14 @@ define protected amdgpu_kernel void @or(i32 addrspace(1)* %p, %S addrspace(1)* %
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw or i32 addrspace(1)* %p, i32 1 monotonic
+  %n32 = atomicrmw or ptr addrspace(1) %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @xor(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: xor:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -108,14 +108,14 @@ define protected amdgpu_kernel void @xor(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw xor i32 addrspace(1)* %p, i32 1 monotonic
+  %n32 = atomicrmw xor ptr addrspace(1) %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @nand(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: nand:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -144,14 +144,14 @@ define protected amdgpu_kernel void @nand(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw nand i32 addrspace(1)* %p, i32 1 monotonic
+  %n32 = atomicrmw nand ptr addrspace(1) %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @max_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: max_workgroup:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -166,14 +166,14 @@ define protected amdgpu_kernel void @max_workgroup(i32 addrspace(1)* %p, %S addr
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw max i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
+  %n32 = atomicrmw max ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: max:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -201,14 +201,14 @@ define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw max i32 addrspace(1)* %p, i32 1 monotonic
+  %n32 = atomicrmw max ptr addrspace(1) %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @min_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: min_workgroup:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -223,14 +223,14 @@ define protected amdgpu_kernel void @min_workgroup(i32 addrspace(1)* %p, %S addr
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw min i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
+  %n32 = atomicrmw min ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: min:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -258,14 +258,14 @@ define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw min i32 addrspace(1)* %p, i32 1 monotonic
+  %n32 = atomicrmw min ptr addrspace(1) %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @umax_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: umax_workgroup:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -280,14 +280,14 @@ define protected amdgpu_kernel void @umax_workgroup(i32 addrspace(1)* %p, %S add
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw umax i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
+  %n32 = atomicrmw umax ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: umax:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -315,14 +315,14 @@ define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw umax i32 addrspace(1)* %p, i32 1 monotonic
+  %n32 = atomicrmw umax ptr addrspace(1) %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @umin_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: umin_workgroup:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -337,14 +337,14 @@ define protected amdgpu_kernel void @umin_workgroup(i32 addrspace(1)* %p, %S add
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw umin i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
+  %n32 = atomicrmw umin ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: umin:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -372,14 +372,14 @@ define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw umin i32 addrspace(1)* %p, i32 1 monotonic
+  %n32 = atomicrmw umin ptr addrspace(1) %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @cmpxchg(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: cmpxchg:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -395,15 +395,15 @@ define protected amdgpu_kernel void @cmpxchg(i32 addrspace(1)* %p, %S addrspace(
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %agg = cmpxchg i32 addrspace(1)* %p, i32 1, i32 2 monotonic monotonic
+  %agg = cmpxchg ptr addrspace(1) %p, i32 1, i32 2 monotonic monotonic
   %n32 = extractvalue {i32, i1} %agg, 0
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @xchg(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: xchg:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -418,14 +418,14 @@ define protected amdgpu_kernel void @xchg(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw xchg i32 addrspace(1)* %p, i32 1 monotonic
+  %n32 = atomicrmw xchg ptr addrspace(1) %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @inc(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: inc:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -439,14 +439,14 @@ define protected amdgpu_kernel void @inc(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %p, i32 0, i32 0, i32 0, i1 false)
+  %n32 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %p, i32 0, i32 0, i32 0, i1 false)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @dec(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: dec:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -460,14 +460,14 @@ define protected amdgpu_kernel void @dec(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %p, i32 0, i32 0, i32 0, i1 false)
+  %n32 = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %p, i32 0, i32 0, i32 0, i1 false)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: fadd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -496,15 +496,15 @@ define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %f32 = atomicrmw fadd float addrspace(1)* %p, float 1.0 monotonic
+  %f32 = atomicrmw fadd ptr addrspace(1) %p, float 1.0 monotonic
   %n32 = fptoui float %f32 to i32
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: fsub:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -533,15 +533,15 @@ define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %f32 = atomicrmw fsub float addrspace(1)* %p, float 1.0 monotonic
+  %f32 = atomicrmw fsub ptr addrspace(1) %p, float 1.0 monotonic
   %n32 = fptoui float %f32 to i32
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @fmin(double addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: fmin:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -558,15 +558,15 @@ define protected amdgpu_kernel void @fmin(double addrspace(1)* %p, %S addrspace(
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %f64 = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %p, double 1.0)
+  %f64 = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %p, double 1.0)
   %n32 = fptoui double %f64 to i32
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @fmax(double addrspace(1)* %p, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) %q) {
 ; CHECK-LABEL: fmax:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -583,15 +583,15 @@ define protected amdgpu_kernel void @fmax(double addrspace(1)* %p, %S addrspace(
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %f64 = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %p, double 1.0)
+  %f64 = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %p, double 1.0)
   %n32 = fptoui double %f64 to i32
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.swap(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.swap(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.swap:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -608,12 +608,12 @@ define protected amdgpu_kernel void @buffer.atomic.swap(<4 x i32> inreg %rsrc, i
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.add(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.add(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.add:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -630,12 +630,12 @@ define protected amdgpu_kernel void @buffer.atomic.add(<4 x i32> inreg %rsrc, i3
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.sub(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.sub(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.sub:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -652,12 +652,12 @@ define protected amdgpu_kernel void @buffer.atomic.sub(<4 x i32> inreg %rsrc, i3
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.smin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.smin(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.smin:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -674,12 +674,12 @@ define protected amdgpu_kernel void @buffer.atomic.smin(<4 x i32> inreg %rsrc, i
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.smax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.smax(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.smax:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -696,12 +696,12 @@ define protected amdgpu_kernel void @buffer.atomic.smax(<4 x i32> inreg %rsrc, i
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.umin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.umin(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.umin:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -718,12 +718,12 @@ define protected amdgpu_kernel void @buffer.atomic.umin(<4 x i32> inreg %rsrc, i
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.umax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.umax(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.umax:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -740,12 +740,12 @@ define protected amdgpu_kernel void @buffer.atomic.umax(<4 x i32> inreg %rsrc, i
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.and(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.and(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.and:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -762,12 +762,12 @@ define protected amdgpu_kernel void @buffer.atomic.and(<4 x i32> inreg %rsrc, i3
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.or(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.or(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.or:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -784,12 +784,12 @@ define protected amdgpu_kernel void @buffer.atomic.or(<4 x i32> inreg %rsrc, i32
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.xor(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.xor(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.xor:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -806,12 +806,12 @@ define protected amdgpu_kernel void @buffer.atomic.xor(<4 x i32> inreg %rsrc, i3
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.inc(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.inc(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.inc:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -828,12 +828,12 @@ define protected amdgpu_kernel void @buffer.atomic.inc(<4 x i32> inreg %rsrc, i3
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.dec(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.dec(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.dec:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -850,12 +850,12 @@ define protected amdgpu_kernel void @buffer.atomic.dec(<4 x i32> inreg %rsrc, i3
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.cmpswap(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.cmpswap(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.cmpswap:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -873,12 +873,12 @@ define protected amdgpu_kernel void @buffer.atomic.cmpswap(<4 x i32> inreg %rsrc
 ; CHECK-NEXT:    s_endpgm
   %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 1, i32 2, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.fadd(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.fadd(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.fadd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -897,12 +897,12 @@ define protected amdgpu_kernel void @buffer.atomic.fadd(<4 x i32> inreg %rsrc, i
   %f32 = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n32 = fptoui float %f32 to i32
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.fmin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.fmin(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.fmin:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -922,12 +922,12 @@ define protected amdgpu_kernel void @buffer.atomic.fmin(<4 x i32> inreg %rsrc, i
   %f64 = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n32 = fptoui double %f64 to i32
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.atomic.fmax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
+define protected amdgpu_kernel void @buffer.atomic.fmax(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.atomic.fmax:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -947,15 +947,15 @@ define protected amdgpu_kernel void @buffer.atomic.fmax(<4 x i32> inreg %rsrc, i
   %f64 = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
   %n32 = fptoui double %f64 to i32
   %n64 = zext i32 %n32 to i64
-  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
-  store float 1.0, float addrspace(1)* %p1
+  %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
+  store float 1.0, ptr addrspace(1) %p1
   ret void
 }
 
-declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)*, i32, i32 immarg, i32 immarg, i1 immarg)
-declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)*, i32, i32 immarg, i32 immarg, i1 immarg)
-declare double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)*, double)
-declare double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)*, double)
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1), i32, i32 immarg, i32 immarg, i1 immarg)
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1), i32, i32 immarg, i32 immarg, i1 immarg)
+declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1), double)
+declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1), double)
 declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32)
 declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32)
 declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32)

diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
index 6694fda145f56..b0b9bbe4835ed 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
@@ -5,11 +5,11 @@
 ; GCN-NOT: s_load_dword s
 ; GCN: flat_load_dword
 ; GCN-NOT: s_load_dword s
-define amdgpu_kernel void @private_load_maybe_divergent(i32 addrspace(4)* %k, i32* %flat) {
-  %load = load volatile i32, i32 addrspace(5)* undef, align 4
-  %gep = getelementptr inbounds i32, i32 addrspace(4)* %k, i32 %load
-  %maybe.not.uniform.load = load i32, i32 addrspace(4)* %gep, align 4
-  store i32 %maybe.not.uniform.load, i32 addrspace(1)* undef
+define amdgpu_kernel void @private_load_maybe_divergent(ptr addrspace(4) %k, ptr %flat) {
+  %load = load volatile i32, ptr addrspace(5) undef, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(4) %k, i32 %load
+  %maybe.not.uniform.load = load i32, ptr addrspace(4) %gep, align 4
+  store i32 %maybe.not.uniform.load, ptr addrspace(1) undef
   ret void
 }
 
@@ -21,10 +21,10 @@ define amdgpu_kernel void @private_load_maybe_divergent(i32 addrspace(4)* %k, i3
 ; GCN: flat_load_dword
 ; GCN-NOT: s_load
 ; GCN: flat_store_dword
-define amdgpu_kernel void @flat_load_maybe_divergent(i32 addrspace(4)* %k, i32* %flat) {
-  %load = load i32, i32* %flat, align 4
-  %gep = getelementptr inbounds i32, i32 addrspace(4)* %k, i32 %load
-  %maybe.not.uniform.load = load i32, i32 addrspace(4)* %gep, align 4
-  store i32 %maybe.not.uniform.load, i32 addrspace(1)* undef
+define amdgpu_kernel void @flat_load_maybe_divergent(ptr addrspace(4) %k, ptr %flat) {
+  %load = load i32, ptr %flat, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(4) %k, i32 %load
+  %maybe.not.uniform.load = load i32, ptr addrspace(4) %gep, align 4
+  store i32 %maybe.not.uniform.load, ptr addrspace(1) undef
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
index 9b416710c640a..8c4643fabb2a3 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s
 
-define i32 @mullohi_u32(i32 %arg, i32 %arg1, i32* %arg2) {
+define i32 @mullohi_u32(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mullohi_u32:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -15,12 +15,12 @@ bb:
   %i4 = mul nuw i64 %i3, %i
   %i5 = lshr i64 %i4, 32
   %i6 = trunc i64 %i5 to i32
-  store i32 %i6, i32* %arg2, align 4
+  store i32 %i6, ptr %arg2, align 4
   %i7 = trunc i64 %i4 to i32
   ret i32 %i7
 }
 
-define i32 @mullohi_s32(i32 %arg, i32 %arg1, i32* %arg2) {
+define i32 @mullohi_s32(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mullohi_s32:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34,12 +34,12 @@ bb:
   %i4 = mul nsw i64 %i3, %i
   %i5 = ashr i64 %i4, 32
   %i6 = trunc i64 %i5 to i32
-  store i32 %i6, i32* %arg2, align 4
+  store i32 %i6, ptr %arg2, align 4
   %i7 = trunc i64 %i4 to i32
   ret i32 %i7
 }
 
-define i32 @mullohi_u32_non_const_shift(i32 %arg, i32 %arg1, i32* %arg2, i64 %shift) {
+define i32 @mullohi_u32_non_const_shift(i32 %arg, i32 %arg1, ptr %arg2, i64 %shift) {
 ; CHECK-LABEL: mullohi_u32_non_const_shift:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -54,13 +54,13 @@ bb:
   %i4 = mul nuw i64 %i3, %i
   %i5 = lshr i64 %i4, 32
   %i6 = trunc i64 %i5 to i32
-  store i32 %i6, i32* %arg2, align 4
+  store i32 %i6, ptr %arg2, align 4
   %i7 = lshr i64 %i4, %shift
   %i8 = trunc i64 %i7 to i32
   ret i32 %i8
 }
 
-define <2 x i32> @mullohi_2xu32(<2 x i32> %arg, <2 x i32> %arg1, <2 x i32>* %arg2) {
+define <2 x i32> @mullohi_2xu32(<2 x i32> %arg, <2 x i32> %arg1, ptr %arg2) {
 ; CHECK-LABEL: mullohi_2xu32:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -79,12 +79,12 @@ bb:
   %i4 = mul nuw <2 x i64> %i3, %i
   %i5 = lshr <2 x i64> %i4, <i64 32, i64 32>
   %i6 = trunc <2 x i64> %i5 to <2 x i32>
-  store <2 x i32> %i6, <2 x i32>* %arg2, align 8
+  store <2 x i32> %i6, ptr %arg2, align 8
   %i7 = trunc <2 x i64> %i4 to <2 x i32>
   ret <2 x i32> %i7
 }
 
-define i8 @mullohi_illegal_ty(i8 %arg, i8 %arg1, i8* %arg2) {
+define i8 @mullohi_illegal_ty(i8 %arg, i8 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mullohi_illegal_ty:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -99,12 +99,12 @@ bb:
   %i4 = mul nuw i16 %i3, %i
   %i5 = lshr i16 %i4, 8
   %i6 = trunc i16 %i5 to i8
-  store i8 %i6, i8* %arg2, align 1
+  store i8 %i6, ptr %arg2, align 1
   %i7 = trunc i16 %i4 to i8
   ret i8 %i7
 }
 
-define i32 @mul_one_bit_low_hi_u32(i32 %arg, i32 %arg1, i32* %arg2) {
+define i32 @mul_one_bit_low_hi_u32(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mul_one_bit_low_hi_u32:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -119,13 +119,13 @@ bb:
   %i4 = mul nsw i64 %i3, %i
   %i5 = lshr i64 %i4, 32
   %i6 = trunc i64 %i5 to i32
-  store i32 %i6, i32* %arg2, align 4
+  store i32 %i6, ptr %arg2, align 4
   %i7 = lshr i64 %i4, 31
   %i8 = trunc i64 %i7 to i32
   ret i32 %i8
 }
 
-define i32 @mul_one_bit_hi_hi_u32_lshr_lshr(i32 %arg, i32 %arg1, i32* %arg2) {
+define i32 @mul_one_bit_hi_hi_u32_lshr_lshr(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mul_one_bit_hi_hi_u32_lshr_lshr:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -140,13 +140,13 @@ bb:
   %i4 = mul nsw i64 %i3, %i
   %i5 = lshr i64 %i4, 32
   %i6 = trunc i64 %i5 to i32
-  store i32 %i6, i32* %arg2, align 4
+  store i32 %i6, ptr %arg2, align 4
   %i7 = lshr i64 %i4, 33
   %i8 = trunc i64 %i7 to i32
   ret i32 %i8
 }
 
-define i32 @mul_one_bit_hi_hi_u32_lshr_ashr(i32 %arg, i32 %arg1, i32* %arg2) {
+define i32 @mul_one_bit_hi_hi_u32_lshr_ashr(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mul_one_bit_hi_hi_u32_lshr_ashr:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -161,7 +161,7 @@ bb:
   %i4 = mul nsw i64 %i3, %i
   %i5 = lshr i64 %i4, 32
   %i6 = trunc i64 %i5 to i32
-  store i32 %i6, i32* %arg2, align 4
+  store i32 %i6, ptr %arg2, align 4
   %i7 = ashr i64 %i4, 33
   %i8 = trunc i64 %i7 to i32
   ret i32 %i8

diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
index ccf9eec087b25..b4ca1547e734c 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
@@ -5,11 +5,11 @@
 ; GCN:     v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
 ; GCN-NOT: v_and_b32
 ; GCN:     store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @select_and1(i32 addrspace(1)* %p, i32 %x, i32 %y) {
+define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, i32 0, i32 -1
   %a = and i32 %y, %s
-  store i32 %a, i32 addrspace(1)* %p, align 4
+  store i32 %a, ptr addrspace(1) %p, align 4
   ret void
 }
 
@@ -18,11 +18,11 @@ define amdgpu_kernel void @select_and1(i32 addrspace(1)* %p, i32 %x, i32 %y) {
 ; GCN:     v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
 ; GCN-NOT: v_and_b32
 ; GCN:     store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @select_and2(i32 addrspace(1)* %p, i32 %x, i32 %y) {
+define amdgpu_kernel void @select_and2(ptr addrspace(1) %p, i32 %x, i32 %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, i32 0, i32 -1
   %a = and i32 %s, %y
-  store i32 %a, i32 addrspace(1)* %p, align 4
+  store i32 %a, ptr addrspace(1) %p, align 4
   ret void
 }
 
@@ -31,11 +31,11 @@ define amdgpu_kernel void @select_and2(i32 addrspace(1)* %p, i32 %x, i32 %y) {
 ; GCN:     v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
 ; GCN-NOT: v_and_b32
 ; GCN:     store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @select_and3(i32 addrspace(1)* %p, i32 %x, i32 %y) {
+define amdgpu_kernel void @select_and3(ptr addrspace(1) %p, i32 %x, i32 %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, i32 -1, i32 0
   %a = and i32 %y, %s
-  store i32 %a, i32 addrspace(1)* %p, align 4
+  store i32 %a, ptr addrspace(1) %p, align 4
   ret void
 }
 
@@ -50,11 +50,11 @@ define amdgpu_kernel void @select_and3(i32 addrspace(1)* %p, i32 %x, i32 %y) {
 ; GCN:     v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
 ; GCN-NOT: v_and_b32
 ; GCN:     global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
-define amdgpu_kernel void @select_and_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4 x i32> %y) {
+define amdgpu_kernel void @select_and_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
   %a = and <4 x i32> %s, %y
-  store <4 x i32> %a, <4 x i32> addrspace(1)* %p, align 32
+  store <4 x i32> %a, ptr addrspace(1) %p, align 32
   ret void
 }
 
@@ -63,11 +63,11 @@ define amdgpu_kernel void @select_and_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4
 ; GCN:     v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
 ; GCN-NOT: v_or_b32
 ; GCN:     store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @select_or1(i32 addrspace(1)* %p, i32 %x, i32 %y) {
+define amdgpu_kernel void @select_or1(ptr addrspace(1) %p, i32 %x, i32 %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, i32 0, i32 -1
   %a = or i32 %y, %s
-  store i32 %a, i32 addrspace(1)* %p, align 4
+  store i32 %a, ptr addrspace(1) %p, align 4
   ret void
 }
 
@@ -76,11 +76,11 @@ define amdgpu_kernel void @select_or1(i32 addrspace(1)* %p, i32 %x, i32 %y) {
 ; GCN:     v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
 ; GCN-NOT: v_or_b32
 ; GCN:     store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @select_or2(i32 addrspace(1)* %p, i32 %x, i32 %y) {
+define amdgpu_kernel void @select_or2(ptr addrspace(1) %p, i32 %x, i32 %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, i32 0, i32 -1
   %a = or i32 %s, %y
-  store i32 %a, i32 addrspace(1)* %p, align 4
+  store i32 %a, ptr addrspace(1) %p, align 4
   ret void
 }
 
@@ -89,11 +89,11 @@ define amdgpu_kernel void @select_or2(i32 addrspace(1)* %p, i32 %x, i32 %y) {
 ; GCN:     v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
 ; GCN-NOT: v_or_b32
 ; GCN:     store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @select_or3(i32 addrspace(1)* %p, i32 %x, i32 %y) {
+define amdgpu_kernel void @select_or3(ptr addrspace(1) %p, i32 %x, i32 %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, i32 -1, i32 0
   %a = or i32 %y, %s
-  store i32 %a, i32 addrspace(1)* %p, align 4
+  store i32 %a, ptr addrspace(1) %p, align 4
   ret void
 }
 
@@ -108,49 +108,49 @@ define amdgpu_kernel void @select_or3(i32 addrspace(1)* %p, i32 %x, i32 %y) {
 ; GCN:     v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
 ; GCN:     v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
 ; GCN:     global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
-define amdgpu_kernel void @select_or_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4 x i32> %y) {
+define amdgpu_kernel void @select_or_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
   %a = or <4 x i32> %s, %y
-  store <4 x i32> %a, <4 x i32> addrspace(1)* %p, align 32
+  store <4 x i32> %a, ptr addrspace(1) %p, align 32
   ret void
 }
 
 ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants:
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 9, 2
-define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(i32 addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i32 -4, i32 3
   %bo = sub i32 5, %sel
-  store i32 %bo, i32 addrspace(1)* %p, align 4
+  store i32 %bo, ptr addrspace(1) %p, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16:
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 9,
-define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(i16 addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i16 -4, i16 3
   %bo = sub i16 5, %sel
-  store i16 %bo, i16 addrspace(1)* %p, align 2
+  store i16 %bo, ptr addrspace(1) %p, align 2
   ret void
 }
 
 ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16_neg:
 ; GCN: v_mov_b32_e32 [[F:v[0-9]+]], 0xfffff449
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, [[F]], -3,
-define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16_neg(i16 addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16_neg(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i16 4, i16 3000
   %bo = sub i16 1, %sel
-  store i16 %bo, i16 addrspace(1)* %p, align 2
+  store i16 %bo, ptr addrspace(1) %p, align 2
   ret void
 }
 
 ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_v2i16:
 ; GCN-DAG: s_mov_b32 [[T:s[0-9]+]], 0x50009
 ; GCN:     s_cselect_b32 s{{[0-9]+}}, [[T]], 0x60002
-define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v2i16(<2 x i16> addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v2i16(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, <2 x i16> <i16 -4, i16 2>, <2 x i16> <i16 3, i16 1>
   %bo = sub <2 x i16> <i16 5, i16 7>, %sel
-  store <2 x i16> %bo, <2 x i16> addrspace(1)* %p, align 4
+  store <2 x i16> %bo, ptr addrspace(1) %p, align 4
   ret void
 }
 
@@ -164,91 +164,91 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v2i16(<2 x i
 ; GCN:     v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
 ; GCN:     v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
 ; GCN:     global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
-define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v4i32(<4 x i32> addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v4i32(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, <4 x i32> <i32 -4, i32 2, i32 3, i32 4>, <4 x i32> <i32 3, i32 1, i32 -1, i32 -3>
   %bo = sub <4 x i32> <i32 5, i32 7, i32 9, i32 11>, %sel
-  store <4 x i32> %bo, <4 x i32> addrspace(1)* %p, align 32
+  store <4 x i32> %bo, ptr addrspace(1) %p, align 32
   ret void
 }
 
 ; GCN-LABEL: {{^}}sdiv_constant_sel_constants_i64:
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5
-define amdgpu_kernel void @sdiv_constant_sel_constants_i64(i64 addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @sdiv_constant_sel_constants_i64(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i64 121, i64 23
   %bo = sdiv i64 120, %sel
-  store i64 %bo, i64 addrspace(1)* %p, align 8
+  store i64 %bo, ptr addrspace(1) %p, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}sdiv_constant_sel_constants_i32:
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 26, 8
-define amdgpu_kernel void @sdiv_constant_sel_constants_i32(i32 addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @sdiv_constant_sel_constants_i32(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i32 7, i32 23
   %bo = sdiv i32 184, %sel
-  store i32 %bo, i32 addrspace(1)* %p, align 8
+  store i32 %bo, ptr addrspace(1) %p, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}udiv_constant_sel_constants_i64:
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5
-define amdgpu_kernel void @udiv_constant_sel_constants_i64(i64 addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @udiv_constant_sel_constants_i64(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i64 -4, i64 23
   %bo = udiv i64 120, %sel
-  store i64 %bo, i64 addrspace(1)* %p, align 8
+  store i64 %bo, ptr addrspace(1) %p, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}srem_constant_sel_constants:
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3
-define amdgpu_kernel void @srem_constant_sel_constants(i64 addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @srem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i64 34, i64 15
   %bo = srem i64 33, %sel
-  store i64 %bo, i64 addrspace(1)* %p, align 8
+  store i64 %bo, ptr addrspace(1) %p, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}urem_constant_sel_constants:
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3
-define amdgpu_kernel void @urem_constant_sel_constants(i64 addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @urem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i64 34, i64 15
   %bo = urem i64 33, %sel
-  store i64 %bo, i64 addrspace(1)* %p, align 8
+  store i64 %bo, ptr addrspace(1) %p, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}shl_constant_sel_constants:
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 4, 8
-define amdgpu_kernel void @shl_constant_sel_constants(i32 addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @shl_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i32 2, i32 3
   %bo = shl i32 1, %sel
-  store i32 %bo, i32 addrspace(1)* %p, align 4
+  store i32 %bo, ptr addrspace(1) %p, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}lshr_constant_sel_constants:
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 16, 8
-define amdgpu_kernel void @lshr_constant_sel_constants(i32 addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @lshr_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i32 2, i32 3
   %bo = lshr i32 64, %sel
-  store i32 %bo, i32 addrspace(1)* %p, align 4
+  store i32 %bo, ptr addrspace(1) %p, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}ashr_constant_sel_constants:
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 32, 16
-define amdgpu_kernel void @ashr_constant_sel_constants(i32 addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @ashr_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i32 2, i32 3
   %bo = ashr i32 128, %sel
-  store i32 %bo, i32 addrspace(1)* %p, align 4
+  store i32 %bo, ptr addrspace(1) %p, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}fsub_constant_sel_constants:
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, -4.0, 1.0,
-define amdgpu_kernel void @fsub_constant_sel_constants(float addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @fsub_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, float -2.0, float 3.0
   %bo = fsub float -1.0, %sel
-  store float %bo, float addrspace(1)* %p, align 4
+  store float %bo, ptr addrspace(1) %p, align 4
   ret void
 }
 
@@ -257,19 +257,19 @@ define amdgpu_kernel void @fsub_constant_sel_constants(float addrspace(1)* %p, i
 ; GCN-DAG: v_mov_b32_e32 [[T:v[0-9]+]], 0x3c00
 ; GCN-DAG: v_mov_b32_e32 [[F:v[0-9]+]], 0xc400
 ; GCN:     v_cndmask_b32_e32 v{{[0-9]+}}, [[F]], [[T]],
-define amdgpu_kernel void @fsub_constant_sel_constants_f16(half addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @fsub_constant_sel_constants_f16(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, half -2.0, half 3.0
   %bo = fsub half -1.0, %sel
-  store half %bo, half addrspace(1)* %p, align 2
+  store half %bo, ptr addrspace(1) %p, align 2
   ret void
 }
 
 ; GCN-LABEL: {{^}}fsub_constant_sel_constants_v2f16:
 ; GCN:     s_cselect_b32 s{{[0-9]+}}, 0x45003c00, -2.0
-define amdgpu_kernel void @fsub_constant_sel_constants_v2f16(<2 x half> addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @fsub_constant_sel_constants_v2f16(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, <2 x half> <half -2.0, half -3.0>, <2 x half> <half -1.0, half 4.0>
   %bo = fsub <2 x half> <half -1.0, half 2.0>, %sel
-  store <2 x half> %bo, <2 x half> addrspace(1)* %p, align 4
+  store <2 x half> %bo, ptr addrspace(1) %p, align 4
   ret void
 }
 
@@ -284,27 +284,27 @@ define amdgpu_kernel void @fsub_constant_sel_constants_v2f16(<2 x half> addrspac
 ; GCN:     v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
 ; GCN:     v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
 ; GCN:     global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
-define amdgpu_kernel void @fsub_constant_sel_constants_v4f32(<4 x float> addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @fsub_constant_sel_constants_v4f32(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, <4 x float> <float -2.0, float -3.0, float -4.0, float -5.0>, <4 x float> <float -1.0, float 0.0, float 1.0, float 2.0>
   %bo = fsub <4 x float> <float -1.0, float 2.0, float 5.0, float 8.0>, %sel
-  store <4 x float> %bo, <4 x float> addrspace(1)* %p, align 32
+  store <4 x float> %bo, ptr addrspace(1) %p, align 32
   ret void
 }
 
 ; GCN-LABEL: {{^}}fdiv_constant_sel_constants:
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 4.0, -2.0,
-define amdgpu_kernel void @fdiv_constant_sel_constants(float addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @fdiv_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, float -4.0, float 2.0
   %bo = fdiv float 8.0, %sel
-  store float %bo, float addrspace(1)* %p, align 4
+  store float %bo, ptr addrspace(1) %p, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}frem_constant_sel_constants:
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0,
-define amdgpu_kernel void @frem_constant_sel_constants(float addrspace(1)* %p, i1 %cond) {
+define amdgpu_kernel void @frem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, float -4.0, float 3.0
   %bo = frem float 5.0, %sel
-  store float %bo, float addrspace(1)* %p, align 4
+  store float %bo, ptr addrspace(1) %p, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
index c7b12e17832b3..347368210b56a 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @eq_t(float %x) {
   %s1 = select i1 %c1, i32 56789, i32 1
   %c2 = icmp eq i32 %s1, 56789
   %s2 = select i1 %c2, float 4.0, float 2.0
-  store float %s2, float* undef, align 4
+  store float %s2, ptr undef, align 4
   ret void
 }
 
@@ -31,7 +31,7 @@ define amdgpu_kernel void @ne_t(float %x) {
   %s1 = select i1 %c1, i32 56789, i32 1
   %c2 = icmp ne i32 %s1, 56789
   %s2 = select i1 %c2, float 4.0, float 2.0
-  store float %s2, float* undef, align 4
+  store float %s2, ptr undef, align 4
   ret void
 }
 
@@ -48,7 +48,7 @@ define amdgpu_kernel void @eq_f(float %x) {
   %s1 = select i1 %c1, i32 1, i32 56789
   %c2 = icmp eq i32 %s1, 56789
   %s2 = select i1 %c2, float 4.0, float 2.0
-  store float %s2, float* undef, align 4
+  store float %s2, ptr undef, align 4
   ret void
 }
 
@@ -65,7 +65,7 @@ define amdgpu_kernel void @ne_f(float %x) {
   %s1 = select i1 %c1, i32 1, i32 56789
   %c2 = icmp ne i32 %s1, 56789
   %s2 = select i1 %c2, float 4.0, float 2.0
-  store float %s2, float* undef, align 4
+  store float %s2, ptr undef, align 4
   ret void
 }
 
@@ -79,6 +79,6 @@ define amdgpu_kernel void @different_constants(float %x) {
   %s1 = select i1 %c1, i32 56789, i32 1
   %c2 = icmp eq i32 %s1, 5678
   %s2 = select i1 %c2, float 4.0, float 2.0
-  store float %s2, float* undef, align 4
+  store float %s2, ptr undef, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/debug-value.ll b/llvm/test/CodeGen/AMDGPU/debug-value.ll
index b9a1b1b60bf41..d75de527d5393 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-value.ll
@@ -2,26 +2,20 @@
 
 %struct.wombat = type { [4 x i32], [4 x i32], [4 x i32] }
 
-define amdgpu_kernel void @wobble(i8 addrspace(1)* nocapture readonly %arg) #0 !dbg !4 {
+define amdgpu_kernel void @wobble(ptr addrspace(1) nocapture readonly %arg) #0 !dbg !4 {
 bb:
-  %tmp = load i32, i32 addrspace(1)* undef, align 4
-  %tmp1 = load <4 x float>, <4 x float> addrspace(1)* undef, align 16
+  %tmp = load i32, ptr addrspace(1) undef, align 4
+  %tmp1 = load <4 x float>, ptr addrspace(1) undef, align 16
   %tmp2 = sext i32 %tmp to i64
   %tmp3 = shufflevector <4 x float> undef, <4 x float> %tmp1, <2 x i32> <i32 3, i32 7>
   %tmp4 = call float @barney() #2
-  %tmp5 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 0
-  %tmp6 = bitcast i8 addrspace(1)* %tmp5 to <2 x float> addrspace(1)*
-  %tmp7 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 0
-  %tmp8 = bitcast i8 addrspace(1)* %tmp7 to %struct.wombat addrspace(1)*
-  %tmp9 = getelementptr inbounds %struct.wombat, %struct.wombat addrspace(1)* %tmp8, i64 %tmp2, i32 2, i64 0
-  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds %struct.wombat, ptr addrspace(1) %arg, i64 %tmp2, i32 2, i64 0
+  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
   %tmp11 = sext i32 %tmp10 to i64
-  %tmp12 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp6, i64 %tmp11
-  %tmp13 = bitcast <2 x float> addrspace(1)* %tmp12 to i64 addrspace(1)*
-  %tmp14 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef
-  %tmp15 = bitcast i8 addrspace(1)* %tmp14 to <4 x float> addrspace(1)*
-  %tmp16 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %tmp15, i64 undef
-  %tmp17 = load <4 x float>, <4 x float> addrspace(1)* %tmp16, align 16
+  %tmp12 = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i64 %tmp11
+  %tmp14 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 undef
+  %tmp16 = getelementptr inbounds <4 x float>, ptr addrspace(1) %tmp14, i64 undef
+  %tmp17 = load <4 x float>, ptr addrspace(1) %tmp16, align 16
   %tmp18 = fsub <4 x float> %tmp17, %tmp17
   %ext = extractelement <4 x float> %tmp18, i32 1
   %tmp19 = fadd float %ext, 0.000000e+00
@@ -41,18 +35,17 @@ bb25:                                             ; preds = %bb
 
 bb28:                                             ; preds = %bb25, %bb21
   %tmp29 = phi <4 x float> [ %tmp27, %bb25 ], [ %tmp24, %bb21 ]
-  store <4 x float> %tmp29, <4 x float> addrspace(5)* undef, align 16
-  %tmp30 = getelementptr inbounds %struct.wombat, %struct.wombat addrspace(1)* %tmp8, i64 %tmp2, i32 2, i64 2
-  %tmp31 = load i32, i32 addrspace(1)* %tmp30, align 4
+  store <4 x float> %tmp29, ptr addrspace(5) undef, align 16
+  %tmp30 = getelementptr inbounds %struct.wombat, ptr addrspace(1) %arg, i64 %tmp2, i32 2, i64 2
+  %tmp31 = load i32, ptr addrspace(1) %tmp30, align 4
   %tmp32 = sext i32 %tmp31 to i64
-  %tmp33 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp6, i64 %tmp32
-  %tmp34 = bitcast <2 x float> addrspace(1)* %tmp33 to i64 addrspace(1)*
-  %tmp35 = load i64, i64 addrspace(1)* %tmp34, align 8
-  %tmp36 = load i32, i32 addrspace(1)* undef, align 4
+  %tmp33 = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i64 %tmp32
+  %tmp35 = load i64, ptr addrspace(1) %tmp33, align 8
+  %tmp36 = load i32, ptr addrspace(1) undef, align 4
   %tmp37 = sext i32 %tmp36 to i64
-  %tmp38 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* null, i64 %tmp37
-  %tmp39 = load <4 x float>, <4 x float> addrspace(1)* %tmp38, align 16
-  %tmp40 = load <4 x float>, <4 x float> addrspace(1)* undef, align 16
+  %tmp38 = getelementptr inbounds <4 x float>, ptr addrspace(1) null, i64 %tmp37
+  %tmp39 = load <4 x float>, ptr addrspace(1) %tmp38, align 16
+  %tmp40 = load <4 x float>, ptr addrspace(1) undef, align 16
   %tmp41 = fsub <4 x float> zeroinitializer, %tmp40
   %tmp42 = fsub <4 x float> %tmp39, %tmp40
   %tmp43 = extractelement <4 x float> %tmp40, i32 1
@@ -83,8 +76,8 @@ bb28:                                             ; preds = %bb25, %bb21
   %tmp63 = fsub <2 x float> %tmp62, %tmp59
   %tmp64 = extractelement <2 x float> %tmp63, i64 0
   call void @eggs(float %tmp64) #2
-  store <2 x float> %tmp3, <2 x float> addrspace(1)* undef, align 8
-  store float 0.000000e+00, float addrspace(1)* undef, align 4
+  store <2 x float> %tmp3, ptr addrspace(1) undef, align 8
+  store float 0.000000e+00, ptr addrspace(1) undef, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/debug-value2.ll b/llvm/test/CodeGen/AMDGPU/debug-value2.ll
index 18f5253984581..1d4c11de4076c 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value2.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-value2.ll
@@ -6,34 +6,33 @@ declare float @llvm.fmuladd.f32(float, float, float)
 
 declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
 
-declare %struct.ShapeData addrspace(1)* @Scene_getSubShapeData(i32, i8 addrspace(1)*, i32 addrspace(1)*) local_unnamed_addr
+declare ptr addrspace(1) @Scene_getSubShapeData(i32, ptr addrspace(1), ptr addrspace(1)) local_unnamed_addr
 
-define <4 x float> @Scene_transformT(i32 %subshapeIdx, <4 x float> %v, float %time, i8 addrspace(1)* %gScene, i32 addrspace(1)* %gSceneOffsets) local_unnamed_addr !dbg !110 {
+define <4 x float> @Scene_transformT(i32 %subshapeIdx, <4 x float> %v, float %time, ptr addrspace(1) %gScene, ptr addrspace(1) %gSceneOffsets) local_unnamed_addr !dbg !110 {
 entry:
   ; CHECK: v_mov_b32_e32 v[[COPIED_ARG_PIECE:[0-9]+]], v9
 
   ; CHECK: ;DEBUG_VALUE: Scene_transformT:gScene <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr6
   ; CHECK: ;DEBUG_VALUE: Scene_transformT:gScene <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr7
-  call void @llvm.dbg.value(metadata i8 addrspace(1)* %gScene, metadata !120, metadata !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)), !dbg !154
+  call void @llvm.dbg.value(metadata ptr addrspace(1) %gScene, metadata !120, metadata !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)), !dbg !154
   ; CHECK: ;DEBUG_VALUE: Scene_transformT:gSceneOffsets <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr8
   ; CHECK: ;DEBUG_VALUE: Scene_transformT:gSceneOffsets <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr[[COPIED_ARG_PIECE]]
-  call void @llvm.dbg.value(metadata i32 addrspace(1)* %gSceneOffsets, metadata !121, metadata !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)), !dbg !155
-  %call = tail call %struct.ShapeData addrspace(1)* @Scene_getSubShapeData(i32 %subshapeIdx, i8 addrspace(1)* %gScene, i32 addrspace(1)* %gSceneOffsets)
-  %m_linearMotion = getelementptr inbounds %struct.ShapeData, %struct.ShapeData addrspace(1)* %call, i64 0, i32 2
-  %tmp = load <4 x float>, <4 x float> addrspace(1)* %m_linearMotion, align 16
-  %m_angularMotion = getelementptr inbounds %struct.ShapeData, %struct.ShapeData addrspace(1)* %call, i64 0, i32 3
-  %tmp1 = load <4 x float>, <4 x float> addrspace(1)* %m_angularMotion, align 16
-  %m_scaleMotion = getelementptr inbounds %struct.ShapeData, %struct.ShapeData addrspace(1)* %call, i64 0, i32 4
-  %tmp2 = load <4 x float>, <4 x float> addrspace(1)* %m_scaleMotion, align 16
+  call void @llvm.dbg.value(metadata ptr addrspace(1) %gSceneOffsets, metadata !121, metadata !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)), !dbg !155
+  %call = tail call ptr addrspace(1) @Scene_getSubShapeData(i32 %subshapeIdx, ptr addrspace(1) %gScene, ptr addrspace(1) %gSceneOffsets)
+  %m_linearMotion = getelementptr inbounds %struct.ShapeData, ptr addrspace(1) %call, i64 0, i32 2
+  %tmp = load <4 x float>, ptr addrspace(1) %m_linearMotion, align 16
+  %m_angularMotion = getelementptr inbounds %struct.ShapeData, ptr addrspace(1) %call, i64 0, i32 3
+  %tmp1 = load <4 x float>, ptr addrspace(1) %m_angularMotion, align 16
+  %m_scaleMotion = getelementptr inbounds %struct.ShapeData, ptr addrspace(1) %call, i64 0, i32 4
+  %tmp2 = load <4 x float>, ptr addrspace(1) %m_scaleMotion, align 16
   %splat.splatinsert = insertelement <4 x float> undef, float %time, i32 0
   %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
   %tmp3 = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %tmp2, <4 x float> %splat.splat, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>)
-  %m_translation = getelementptr inbounds %struct.ShapeData, %struct.ShapeData addrspace(1)* %call, i64 0, i32 0
-  %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %m_translation, align 16
-  %m_quaternion = getelementptr inbounds %struct.ShapeData, %struct.ShapeData addrspace(1)* %call, i64 0, i32 1
-  %tmp5 = load <4 x float>, <4 x float> addrspace(1)* %m_quaternion, align 16
-  %m_scale = getelementptr inbounds %struct.ShapeData, %struct.ShapeData addrspace(1)* %call, i64 0, i32 8
-  %tmp6 = load <4 x float>, <4 x float> addrspace(1)* %m_scale, align 16
+  %tmp4 = load <4 x float>, ptr addrspace(1) %call, align 16
+  %m_quaternion = getelementptr inbounds %struct.ShapeData, ptr addrspace(1) %call, i64 0, i32 1
+  %tmp5 = load <4 x float>, ptr addrspace(1) %m_quaternion, align 16
+  %m_scale = getelementptr inbounds %struct.ShapeData, ptr addrspace(1) %call, i64 0, i32 8
+  %tmp6 = load <4 x float>, ptr addrspace(1) %m_scale, align 16
   %mul = fmul <4 x float> %tmp6, %v
   %tmp7 = extractelement <4 x float> %tmp5, i64 0
   %sub.i.i = fsub float -0.000000e+00, %tmp7

diff --git a/llvm/test/CodeGen/AMDGPU/debug.ll b/llvm/test/CodeGen/AMDGPU/debug.ll
index 2fa60a247eb75..5620c70d44f16 100644
--- a/llvm/test/CodeGen/AMDGPU/debug.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug.ll
@@ -6,7 +6,7 @@
 ; SI: test:
 ; SI: BB0_0:
 ; SI: s_endpgm
-define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
-  store i32 0, i32 addrspace(1)* %out
+define amdgpu_kernel void @test(ptr addrspace(1) %out) {
+  store i32 0, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
index 5a033753061ce..67061bcb2a785 100644
--- a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
@@ -15,47 +15,45 @@
 
 ; CHECK-LABEL: @no_flat_workgroup_size(
 ; CHECK: alloca [5 x i32]
-; CHECK: store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-define amdgpu_kernel void @no_flat_workgroup_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
+define amdgpu_kernel void @no_flat_workgroup_size(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
-  %0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %3, i32 addrspace(1)* %arrayidx13
+  %0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %3, ptr addrspace(1) %arrayidx13
   ret void
 }
 
 ; CHECK-LABEL: @explicit_default_workgroup_size(
 ; CHECK: alloca [5 x i32]
-; CHECK: store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-define amdgpu_kernel void @explicit_default_workgroup_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
+; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
+define amdgpu_kernel void @explicit_default_workgroup_size(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
-  %0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %3, i32 addrspace(1)* %arrayidx13
+  %0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %3, ptr addrspace(1) %arrayidx13
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll b/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll
index ddc226506357c..445446a3fd7a2 100644
--- a/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll
+++ b/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll
@@ -3,145 +3,145 @@
 ; GCN-LABEL: {{^}}test_default_si:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_default_si(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #0 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_default_vi:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_default_vi(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #1 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_f64_denormals:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_f64_denormals(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #2 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_f32_denormals:
 ; GCNL: FloatMode: 48
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_f32_denormals(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #3 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_f32_f64_denormals:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_f32_f64_denormals(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #4 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_no_denormals
 ; GCN: FloatMode: 0
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_no_denormals(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #5 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_f16_f64_denormals:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
-  store half 0.0, half addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_f16_f64_denormals(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #6 {
+  store half 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_no_f16_f64_denormals:
 ; GCN: FloatMode: 48
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 {
-  store half 0.0, half addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_no_f16_f64_denormals(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #7 {
+  store half 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_f32_f16_f64_denormals:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 {
-  store half 0.0, half addrspace(1)* %out0
-  store float 0.0, float addrspace(1)* %out1
-  store double 0.0, double addrspace(1)* %out2
+define amdgpu_kernel void @test_f32_f16_f64_denormals(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) #8 {
+  store half 0.0, ptr addrspace(1) %out0
+  store float 0.0, ptr addrspace(1) %out1
+  store double 0.0, ptr addrspace(1) %out2
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_just_f32_attr_flush
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_just_f32_attr_flush(float addrspace(1)* %out0, double addrspace(1)* %out1) #9 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_just_f32_attr_flush(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #9 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_flush_all_outputs:
 ; GCN: FloatMode: 80
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_flush_all_outputs(float addrspace(1)* %out0, double addrspace(1)* %out1) #10 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_flush_all_outputs(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #10 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_flush_all_inputs:
 ; GCN: FloatMode: 160
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_flush_all_inputs(float addrspace(1)* %out0, double addrspace(1)* %out1) #11 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_flush_all_inputs(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #11 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_flush_f32_inputs:
 ; GCN: FloatMode: 224
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_flush_f32_inputs(float addrspace(1)* %out0, double addrspace(1)* %out1) #12 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_flush_f32_inputs(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #12 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_flush_f32_outputs:
 ; GCN: FloatMode: 208
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_flush_f32_outputs(float addrspace(1)* %out0, double addrspace(1)* %out1) #13 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_flush_f32_outputs(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #13 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_flush_f64_inputs:
 ; GCN: FloatMode: 176
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_flush_f64_inputs(float addrspace(1)* %out0, double addrspace(1)* %out1) #14 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_flush_f64_inputs(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #14 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_flush_f64_outputs:
 ; GCN: FloatMode: 112
 ; GCN: IeeeMode: 1
-define amdgpu_kernel void @test_flush_f64_outputs(float addrspace(1)* %out0, double addrspace(1)* %out1) #15 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_flush_f64_outputs(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #15 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -160,7 +160,7 @@ main_body:
 ; GCN-LABEL: {{^}}kill_vcc_implicit_def:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 0
-define amdgpu_ps float @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(4)* inreg, [17 x <16 x i8>] addrspace(4)* inreg, [17 x <4 x i32>] addrspace(4)* inreg, [34 x <8 x i32>] addrspace(4)* inreg, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
+define amdgpu_ps float @kill_vcc_implicit_def(ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
 entry:
   %tmp0 = fcmp olt float %13, 0.0
   call void @llvm.amdgcn.kill(i1 %tmp0)

diff --git a/llvm/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll b/llvm/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll
index d3e7550aa3eac..af274a9ab1c82 100644
--- a/llvm/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/diverge-extra-formal-args.ll
@@ -15,8 +15,8 @@ define dllexport amdgpu_vs void @_amdgpu_vs_main(i32 inreg %arg, i32 inreg %arg1
 .entry:
   %tmp = add i32 %arg4, %arg8
   %tmp9 = sext i32 %tmp to i64
-  %tmp10 = getelementptr [6 x <2 x float>], [6 x <2 x float>] addrspace(4)* @0, i64 0, i64 %tmp9
-  %tmp11 = load <2 x float>, <2 x float> addrspace(4)* %tmp10, align 8
+  %tmp10 = getelementptr [6 x <2 x float>], ptr addrspace(4) @0, i64 0, i64 %tmp9
+  %tmp11 = load <2 x float>, ptr addrspace(4) %tmp10, align 8
   %tmp12 = fadd nnan arcp contract <2 x float> zeroinitializer, %tmp11
   %tmp13 = extractelement <2 x float> %tmp12, i32 1
   call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float undef, float %tmp13, float 0.000000e+00, float 1.000000e+00, i1 true, i1 false) #1

diff --git a/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll b/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
index fcc9d3a64bc89..243618ae5a63e 100644
--- a/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll
@@ -17,8 +17,8 @@ define dllexport amdgpu_ps void @_amdgpu_ps_main(i32 inreg %arg) local_unnamed_a
   %tmp2 = srem i32 %tmp1, 4
   %tmp3 = select i1 false, i32 undef, i32 %tmp2
   %tmp4 = sext i32 %tmp3 to i64
-  %tmp5 = getelementptr [4 x <4 x float>], [4 x <4 x float>] addrspace(4)* @0, i64 0, i64 %tmp4
-  %tmp6 = load <4 x float>, <4 x float> addrspace(4)* %tmp5, align 16
+  %tmp5 = getelementptr [4 x <4 x float>], ptr addrspace(4) @0, i64 0, i64 %tmp4
+  %tmp6 = load <4 x float>, ptr addrspace(4) %tmp5, align 16
   %tmp7 = extractelement <4 x float> %tmp6, i32 3
   %tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7) #1
   call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> undef, <2 x half> %tmp8, i1 true, i1 true) #2

diff --git a/llvm/test/CodeGen/AMDGPU/diverge-switch-default.ll b/llvm/test/CodeGen/AMDGPU/diverge-switch-default.ll
index 0018099e68123..b2e19d684873c 100644
--- a/llvm/test/CodeGen/AMDGPU/diverge-switch-default.ll
+++ b/llvm/test/CodeGen/AMDGPU/diverge-switch-default.ll
@@ -4,7 +4,7 @@ target datalayout = "n32"
 
 ; CHECK-LABEL: @switch_unreachable_default
 
-define amdgpu_kernel void @switch_unreachable_default(i32 addrspace(1)* %out, i8 addrspace(1)* %in0, i8 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @switch_unreachable_default(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 centry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   switch i32 %tid, label %sw.default [
@@ -22,8 +22,7 @@ sw.default:
   unreachable
 
 sw.epilog:
-  %ptr = phi i8 addrspace(1)* [%in0, %sw.bb0], [%in1, %sw.bb1]
-  %gep_in = getelementptr inbounds i8, i8 addrspace(1)* %ptr, i64 0
+  %ptr = phi ptr addrspace(1) [%in0, %sw.bb0], [%in1, %sw.bb1]
   br label %sw.while
 
 ; The loop below is necessary to preserve the effect of the
@@ -45,18 +44,18 @@ sw.epilog:
 ; CHECK: br i1 [[LOOP]]
 
 sw.while:
-  %p = phi i8 addrspace(1)* [ %gep_in, %sw.epilog ], [ %incdec.ptr, %sw.while ]
+  %p = phi ptr addrspace(1) [ %ptr, %sw.epilog ], [ %incdec.ptr, %sw.while ]
   %count = phi i32 [ 0, %sw.epilog ], [ %count.inc, %sw.while ]
-  %char = load i8, i8 addrspace(1)* %p, align 1
+  %char = load i8, ptr addrspace(1) %p, align 1
   %tobool = icmp eq i8 %char, 0
-  %incdec.ptr = getelementptr inbounds i8, i8 addrspace(1)* %p, i64 1
+  %incdec.ptr = getelementptr inbounds i8, ptr addrspace(1) %p, i64 1
   %count.inc = add i32 %count, 1
   br i1 %tobool, label %sw.exit, label %sw.while
 
 sw.exit:
   %tid64 = zext i32 %tid to i64
-  %gep_out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid64
-  store i32 %count, i32 addrspace(1)* %gep_out, align 4
+  %gep_out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid64
+  store i32 %count, ptr addrspace(1) %gep_out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/divergence-at-use.ll b/llvm/test/CodeGen/AMDGPU/divergence-at-use.ll
index ab418b70e5261..201691ad968b7 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-at-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-at-use.ll
@@ -10,11 +10,11 @@ entry:
   br label %loop
 loop:
   %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
-  %gep = getelementptr i32, i32 addrspace(3)* @local, i32 %i
+  %gep = getelementptr i32, ptr addrspace(3) @local, i32 %i
   %cond = icmp ult i32 %i, %x
   %i1 = add i32 %i, 1
   br i1 %cond, label %loop, label %exit
 exit:
-  %old = atomicrmw add i32 addrspace(3)* %gep, i32 %x acq_rel
+  %old = atomicrmw add ptr addrspace(3) %gep, i32 %x acq_rel
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
index da977193a16d3..52e17bd747c5e 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
@@ -3,12 +3,12 @@
 
 ; FUNC-LABEL: {{^}}v_abs_i32:
 ; GCN: S_ABS_I32
-define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_abs_i32(ptr addrspace(1) %out, i32 %val) nounwind {
   %neg = sub i32 0, %val
   %cond = icmp sgt i32 %val, %neg
   %res = select i1 %cond, i32 %val, i32 %neg
   %res2 = add i32 %res, 2
-  store i32 %res2, i32 addrspace(1)* %out, align 4
+  store i32 %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -16,22 +16,22 @@ define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind
 ; SI:  V_SUB_CO_U32_e64
 ; GFX900: V_SUB_U32_e64
 ; GCN: V_MAX_I32_e64
-define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_i32(ptr addrspace(1) %out, ptr addrspace(1) %src) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %src, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds i32, ptr addrspace(1) %src, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in, align 4
   %neg = sub i32 0, %val
   %cond = icmp sgt i32 %val, %neg
   %res = select i1 %cond, i32 %val, i32 %neg
   %res2 = add i32 %res, 2
-  store i32 %res2, i32 addrspace(1)* %out, align 4
+  store i32 %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_abs_v2i32:
 ; GCN: S_ABS_I32
 ; GCN: S_ABS_I32
-define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
+define amdgpu_kernel void @s_abs_v2i32(ptr addrspace(1) %out, <2 x i32> %val) nounwind {
   %z0 = insertelement <2 x i32> undef, i32 0, i32 0
   %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
   %t0 = insertelement <2 x i32> undef, i32 2, i32 0
@@ -40,7 +40,7 @@ define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
   %cond = icmp sgt <2 x i32> %val, %neg
   %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg
   %res2 = add <2 x i32> %res, %t1
-  store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4
+  store <2 x i32> %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -49,19 +49,19 @@ define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
 ; GFX900: V_SUB_U32_e64
 ; GCN: V_MAX_I32_e64
 ; GCN: V_MAX_I32_e64
-define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %src) nounwind {
   %z0 = insertelement <2 x i32> undef, i32 0, i32 0
   %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
   %t0 = insertelement <2 x i32> undef, i32 2, i32 0
   %t1 = insertelement <2 x i32> %t0, i32 2, i32 1
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %src, i32 %tid
-  %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <2 x i32>, ptr addrspace(1) %src, i32 %tid
+  %val = load <2 x i32>, ptr addrspace(1) %gep.in, align 4
   %neg = sub <2 x i32> %z1, %val
   %cond = icmp sgt <2 x i32> %val, %neg
   %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg
   %res2 = add <2 x i32> %res, %t1
-  store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4
+  store <2 x i32> %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll
index e40fdb7ba471b..6b1f34e72dac3 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll
@@ -2,21 +2,21 @@
 
 ; GCN-LABEL: @bfe_uniform
 ; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010
-define amdgpu_kernel void @bfe_uniform(i32 %val, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @bfe_uniform(i32 %val, ptr addrspace(1) %out) {
   %hibits = lshr i32 %val, 16
   %masked = and i32 %hibits, 15
-  store i32 %masked, i32 addrspace(1)* %out
+  store i32 %masked, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: @bfe_divergent
 ; GCN: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 4
-define amdgpu_kernel void @bfe_divergent(i32 %val, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @bfe_divergent(i32 %val, ptr addrspace(1) %out) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %divergent = add i32 %val, %tid
   %hibits = lshr i32 %divergent, 16
   %masked = and i32 %hibits, 15
-  store i32 %masked, i32 addrspace(1)* %out
+  store i32 %masked, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-bitreverse.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-bitreverse.ll
index bd87162264e41..c220620e3bca9 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-bitreverse.ll
@@ -2,39 +2,39 @@
 
 ; GCN-FUNC: uniform_bitreverse_i32
 ; GCN: S_BREV_B32
-define amdgpu_kernel void @uniform_bitreverse_i32(i32 %val, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_bitreverse_i32(i32 %val, ptr addrspace(1) %out) {
   %res = call i32 @llvm.bitreverse.i32(i32 %val)
-  store i32 %res, i32 addrspace(1)* %out
+  store i32 %res, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-FUNC: divergent_bitreverse_i32
 ; GCN: V_BFREV_B32
-define amdgpu_kernel void @divergent_bitreverse_i32(i32 %val, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @divergent_bitreverse_i32(i32 %val, ptr addrspace(1) %out) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %divergent = add i32 %val, %tid
   %res = call i32 @llvm.bitreverse.i32(i32 %divergent)
-  store i32 %res, i32 addrspace(1)* %out
+  store i32 %res, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-FUNC: uniform_bitreverse_i64
 ; GCN: S_BREV_B64
-define amdgpu_kernel void @uniform_bitreverse_i64(i64 %val, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_bitreverse_i64(i64 %val, ptr addrspace(1) %out) {
   %res = call i64 @llvm.bitreverse.i64(i64 %val)
-  store i64 %res, i64 addrspace(1)* %out
+  store i64 %res, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-FUNC: divergent_bitreverse_i64
 ; GCN: V_BFREV_B32
 ; GCN: V_BFREV_B32
-define amdgpu_kernel void @divergent_bitreverse_i64(i64 %val, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @divergent_bitreverse_i64(i64 %val, ptr addrspace(1) %out) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %ext = zext i32 %tid to i64
   %divergent = add i64 %val, %ext
   %res = call i64 @llvm.bitreverse.i64(i64 %divergent)
-  store i64 %res, i64 addrspace(1)* %out
+  store i64 %res, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index bde87ae8e9a0a..843df525b80f5 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 %s
 
-define amdgpu_kernel void @uniform_vec_0_i16(i32 addrspace(1)* %out, i16 %a) {
+define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
 ; GCN-LABEL: uniform_vec_0_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -40,7 +40,7 @@ define amdgpu_kernel void @uniform_vec_0_i16(i32 addrspace(1)* %out, i16 %a) {
   %tmp = insertelement <2 x i16> undef, i16 0, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
   %val = bitcast <2 x i16> %vec to i32
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -68,7 +68,7 @@ define i32 @divergent_vec_0_i16(i16 %a) {
   ret i32 %val
 }
 
-define amdgpu_kernel void @uniform_vec_i16_0(i32 addrspace(1)* %out, i16 %a) {
+define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) {
 ; GCN-LABEL: uniform_vec_i16_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -105,7 +105,7 @@ define amdgpu_kernel void @uniform_vec_i16_0(i32 addrspace(1)* %out, i16 %a) {
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
   %val = bitcast <2 x i16> %vec to i32
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -133,7 +133,7 @@ define i32 @divergent_vec_i16_0(i16 %a) {
   ret i32 %val
 }
 
-define amdgpu_kernel void @uniform_vec_f16_0(float addrspace(1)* %out, half %a) {
+define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) {
 ; GCN-LABEL: uniform_vec_f16_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -170,7 +170,7 @@ define amdgpu_kernel void @uniform_vec_f16_0(float addrspace(1)* %out, half %a)
   %tmp = insertelement <2 x half> undef, half %a, i32 0
   %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
   %val = bitcast <2 x half> %vec to float
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -198,7 +198,7 @@ define float @divergent_vec_f16_0(half %a) {
   ret float %val
 }
 
-define amdgpu_kernel void @uniform_vec_i16_LL(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) {
+define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) {
 ; GCN-LABEL: uniform_vec_i16_LL:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -239,8 +239,8 @@ define amdgpu_kernel void @uniform_vec_i16_LL(i32 addrspace(4)* %in0, i32 addrsp
 ; GFX906-NEXT:    ; use s0
 ; GFX906-NEXT:    ;;#ASMEND
 ; GFX906-NEXT:    s_endpgm
-  %val0 = load volatile i32, i32 addrspace(4)* %in0
-  %val1 = load volatile i32, i32 addrspace(4)* %in1
+  %val0 = load volatile i32, ptr addrspace(4) %in0
+  %val1 = load volatile i32, ptr addrspace(4) %in1
   %lo = trunc i32 %val0 to i16
   %hi = trunc i32 %val1 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
@@ -278,7 +278,7 @@ define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) {
   ret i32 %val
 }
 
-define amdgpu_kernel void @uniform_vec_i16_LH(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 %b) {
 ; GCN-LABEL: uniform_vec_i16_LH:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -318,7 +318,7 @@ define amdgpu_kernel void @uniform_vec_i16_LH(i32 addrspace(1)* %out, i16 %a, i3
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
   %val = bitcast <2 x i16> %vec to i32
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -351,7 +351,7 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
   ret i32 %val
 }
 
-define amdgpu_kernel void @uniform_vec_i16_HH(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; GCN-LABEL: uniform_vec_i16_HH:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -393,7 +393,7 @@ define amdgpu_kernel void @uniform_vec_i16_HH(i32 addrspace(1)* %out, i32 %a, i3
   %tmp = insertelement <2 x i16> undef, i16 %tr_a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %tr_b, i32 1
   %val = bitcast <2 x i16> %vec to i32
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -429,7 +429,7 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
   ret i32 %val
 }
 
-define amdgpu_kernel void @uniform_vec_f16_LL(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) {
+define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspace(4) %in1) {
 ; GCN-LABEL: uniform_vec_f16_LL:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -470,8 +470,8 @@ define amdgpu_kernel void @uniform_vec_f16_LL(i32 addrspace(4)* %in0, i32 addrsp
 ; GFX906-NEXT:    ; use s0
 ; GFX906-NEXT:    ;;#ASMEND
 ; GFX906-NEXT:    s_endpgm
-  %val0 = load volatile i32, i32 addrspace(4)* %in0
-  %val1 = load volatile i32, i32 addrspace(4)* %in1
+  %val0 = load volatile i32, ptr addrspace(4) %in0
+  %val1 = load volatile i32, ptr addrspace(4) %in1
   %lo.i = trunc i32 %val0 to i16
   %hi.i = trunc i32 %val1 to i16
   %lo = bitcast i16 %lo.i to half
@@ -513,7 +513,7 @@ define float @divergent_vec_f16_LL(half %a, half %b) {
   ret float %val
 }
 
-define <2 x i16> @build_vec_v2i16_undeflo_divergent(i16 addrspace(3)* %in) #0 {
+define <2 x i16> @build_vec_v2i16_undeflo_divergent(ptr addrspace(3) %in) #0 {
 ; GCN-LABEL: build_vec_v2i16_undeflo_divergent:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -536,12 +536,12 @@ define <2 x i16> @build_vec_v2i16_undeflo_divergent(i16 addrspace(3)* %in) #0 {
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %build = insertelement <2 x i16> undef, i16 %load, i32 0
   ret <2 x i16> %build
 }
 
-define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(i16 addrspace(3)* %in, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in, ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: build_vec_v2i16_undeflo_uniform:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x9
@@ -580,9 +580,9 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(i16 addrspace(3)* %in
 ; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX906-NEXT:    s_endpgm
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %build = insertelement <2 x i16> undef, i16 %load, i32 0
   %result = bitcast <2 x i16> %build to i32
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-ctlz-cttz.ll
index 1a5ce43dee578..01cba682aa088 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-ctlz-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-ctlz-cttz.ll
@@ -2,57 +2,57 @@
 
 ; GCN-LABEL: name:            s_ctlz_i32
 ; GCN: S_FLBIT_I32_B32
-define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
-  store i32 %ctlz, i32 addrspace(1)* %out, align 4
+  store i32 %ctlz, ptr addrspace(1) %out, align 4
   ret void
 }
 ; GCN-LABEL: name:            v_ctlz_i32
 ; GCN: V_FFBH_U32_e64
-define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
-  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
+  %val = load i32, ptr addrspace(1) %in.gep, align 4
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
-  store i32 %ctlz, i32 addrspace(1)* %out, align 4
+  store i32 %ctlz, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: name:            s_cttz_i32
 ; GCN: S_FF1_I32_B32
-define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {	
+define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {	
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
-  store i32 %cttz, i32 addrspace(1)* %out, align 4
+  store i32 %cttz, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: name:            v_cttz_i32
 ; GCN: V_FFBL_B32_e64
-define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
-  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
+  %val = load i32, ptr addrspace(1) %in.gep, align 4
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
-  store i32 %cttz, i32 addrspace(1)* %out, align 4
+  store i32 %cttz, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: name:            s_flbit
 ; GCN: S_FLBIT_I32
-define amdgpu_kernel void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
+define amdgpu_kernel void @s_flbit(ptr addrspace(1) noalias %out, i32 %val) #0 {
   %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
-  store i32 %r, i32 addrspace(1)* %out, align 4
+  store i32 %r, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: name:            v_flbit
 ; GCN: V_FFBH_I32_e64
-define amdgpu_kernel void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_flbit(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
-  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
+  %val = load i32, ptr addrspace(1) %in.gep, align 4
   %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
-  store i32 %r, i32 addrspace(1)* %out, align 4
+  store i32 %r, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-ctpop.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-ctpop.ll
index 80dc4e16c0b67..0fd948064b081 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-ctpop.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-ctpop.ll
@@ -2,9 +2,9 @@
 
 ; GCN-LABEL: name:            s_ctpop_i32
 ; GCN: S_BCNT1_I32_B32
-define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
-  store i32 %ctpop, i32 addrspace(1)* %out, align 4
+  store i32 %ctpop, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -13,21 +13,21 @@ define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val)
 ; GCN: %[[SREG1:[0-9]+]]:sreg_32 = COPY %[[BCNT]]
 ; GCN: %[[SREG2:[0-9]+]]:sreg_32 = S_MOV_B32 0
 ; GCN: REG_SEQUENCE killed %[[SREG1]], %subreg.sub0, killed %[[SREG2]], %subreg.sub1
-define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, i64 %val) nounwind {
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
-  store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+  store i32 %truncctpop, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: name:            v_ctpop_i32
 ; GCN: V_BCNT_U32_B32_e64
-define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
-  store i32 %ctpop, i32 addrspace(1)* %out, align 4
+  store i32 %ctpop, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -36,13 +36,13 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs
 ; GCN: %[[BCNT2:[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 killed %{{[0-9]+}}, killed %[[BCNT1]], implicit $exec
 ; GCN: %[[VGPR1:[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
 ; GCN: REG_SEQUENCE killed %[[BCNT2]], %subreg.sub0, killed %[[VGPR1]], %subreg.sub1
-define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %val = load i64, i64 addrspace(1)* %in.gep, align 8
+  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %val = load i64, ptr addrspace(1) %in.gep, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
-  store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+  store i32 %truncctpop, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-min-max.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-min-max.ll
index 1a9f468c80d23..72e4a929f587b 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-min-max.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-min-max.ll
@@ -2,72 +2,72 @@
 
 ; GCN-LABEL: name:            uniform_imin
 ; GCN: S_MIN_I32
-define amdgpu_kernel void @uniform_imin(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @uniform_imin(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
   %cmp = icmp sle i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: name:            divergent_imin
 ; GCN: V_MIN_I32_e64
-define void @divergent_imin(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define void @divergent_imin(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
   %cmp = icmp sle i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: name:            uniform_umin
 ; GCN: S_MIN_U32
-define amdgpu_kernel void @uniform_umin(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @uniform_umin(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
   %tmp = icmp ule i32 %a, %b
   %val = select i1 %tmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 8
+  store i32 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; GCN-LABEL: name:            divergent_umin
 ; GCN: V_MIN_U32_e64
-define void @divergent_umin(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define void @divergent_umin(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
   %tmp = icmp ule i32 %a, %b
   %val = select i1 %tmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 8
+  store i32 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; GCN-LABEL: name:            uniform_imax
 ; GCN: S_MAX_I32
-define amdgpu_kernel void @uniform_imax(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @uniform_imax(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp sge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: name:            divergent_imax
 ; GCN: V_MAX_I32_e64
-define void @divergent_imax(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define void @divergent_imax(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp sge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: name:            uniform_umax
 ; GCN: S_MAX_U32
-define amdgpu_kernel void @uniform_umax(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @uniform_umax(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp uge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: name:            divergent_umax
 ; GCN: V_MAX_U32_e64
-define void @divergent_umax(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define void @divergent_umax(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp uge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll
index 9ba2810b5c967..f0aed60468873 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll
@@ -3,22 +3,22 @@
 
 ; FUNC-LABEL: {{^}}uniform_add_SIC:
 ; GCN: S_SUB_I32 killed %{{[0-9]+}}, 32
-define amdgpu_kernel void @uniform_add_SIC(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %a = load i32, i32 addrspace(1)* %in
+define amdgpu_kernel void @uniform_add_SIC(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i32, ptr addrspace(1) %in
   %result = add i32 %a, -32
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}uniform_add_SIC:
 ; SI: V_SUB_CO_U32_e64 killed %{{[0-9]+}}, 32
 ; GFX900: V_SUB_U32_e64 killed %{{[0-9]+}}, 32
-define amdgpu_kernel void @divergent_add_SIC(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @divergent_add_SIC(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
-  %a = load volatile i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+  %a = load volatile i32, ptr addrspace(1) %gep
   %result = add i32 %a, -32
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-not-isel.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-not-isel.ll
index 64a2f73f96387..209c7d990f213 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-not-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-not-isel.ll
@@ -2,17 +2,17 @@
 
 ; GCN-LABEL: name:            scalar_not_i32
 ; GCN: S_NOT_B32
-define amdgpu_kernel void @scalar_not_i32(i32 addrspace(1)* %out, i32 %val) {
+define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %val) {
   %not.val = xor i32 %val, -1
-  store i32 %not.val, i32 addrspace(1)* %out
+  store i32 %not.val, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: name:            scalar_not_i64
 ; GCN: S_NOT_B64
-define amdgpu_kernel void @scalar_not_i64(i64 addrspace(1)* %out, i64 %val) {
+define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %val) {
   %not.val = xor i64 %val, -1
-  store i64 %not.val, i64 addrspace(1)* %out
+  store i64 %not.val, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll
index 82d72bb3ba6ff..4703fb9ea5bc9 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN:  llc -march=amdgcn  < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s
 
-define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; GCN-LABEL: uniform_sext_in_reg_i8_to_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -18,11 +18,11 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(i32 addrspace(1)* %out,
   %c = add i32 %a, %b ; add to prevent folding into extload
   %shl = shl i32 %c, 24
   %ashr = ashr i32 %shl, 24
-  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  store i32 %ashr, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; GCN-LABEL: divergent_sext_in_reg_i8_to_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -41,11 +41,11 @@ define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(i32 addrspace(1)* %ou
   %c.divergent = add i32 %c, %tid
   %shl = shl i32 %c.divergent, 24
   %ashr = ashr i32 %shl, 24
-  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  store i32 %ashr, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; GCN-LABEL: uniform_sext_in_reg_i16_to_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -62,11 +62,11 @@ define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(i32 addrspace(1)* %out
   %c = add i32 %a, %b ; add to prevent folding into extload
   %shl = shl i32 %c, 16
   %ashr = ashr i32 %shl, 16
-  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  store i32 %ashr, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @divergent_sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @divergent_sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; GCN-LABEL: divergent_sext_in_reg_i16_to_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -85,7 +85,7 @@ define amdgpu_kernel void @divergent_sext_in_reg_i16_to_i32(i32 addrspace(1)* %o
   %c.divergent = add i32 %c, %tid
   %shl = shl i32 %c.divergent, 16
   %ashr = ashr i32 %shl, 16
-  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  store i32 %ashr, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll
index 8eae37ea0be1a..d185605403ff2 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll
@@ -3,10 +3,10 @@
 
 ; GCN-LABEL: name:            uniform_xnor_i64
 ; GCN: S_XNOR_B64
-define amdgpu_kernel void @uniform_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @uniform_xnor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
   %xor = xor i64 %a, %b
   %res = xor i64 %xor, -1
-  store i64 %res, i64 addrspace(1)* %out
+  store i64 %res, ptr addrspace(1) %out
   ret void
 }
 ; GCN-LABEL: name:            divergent_xnor_i64
@@ -16,7 +16,7 @@ define amdgpu_kernel void @uniform_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64
 ; GCN: V_NOT_B32_e32
 ; GCN_DL: V_XNOR_B32_e64
 ; GCN_DL: V_XNOR_B32_e64
-define i64 @divergent_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define i64 @divergent_xnor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
   %xor = xor i64 %a, %b
   %res = xor i64 %xor, -1
   ret i64 %res
@@ -24,10 +24,10 @@ define i64 @divergent_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 
 ; GCN-LABEL: name:            uniform_xnor_i32
 ; GCN: S_XNOR_B32
-define amdgpu_kernel void @uniform_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @uniform_xnor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
   %xor = xor i32 %a, %b
   %res = xor i32 %xor, -1
-  store i32 %res, i32 addrspace(1)* %out
+  store i32 %res, ptr addrspace(1) %out
   ret void
 }
 
@@ -35,7 +35,7 @@ define amdgpu_kernel void @uniform_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32
 ; GCN: V_XOR_B32_e64
 ; GCN: V_NOT_B32_e32
 ; GCN_DL: V_XNOR_B32_e64
-define i32 @divergent_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define i32 @divergent_xnor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
   %xor = xor i32 %a, %b
   %res = xor i32 %xor, -1
   ret i32 %res

diff  --git a/llvm/test/CodeGen/AMDGPU/divrem24-assume.ll b/llvm/test/CodeGen/AMDGPU/divrem24-assume.ll
index f2b1a8a9435d5..dc79385d9eaca 100644
--- a/llvm/test/CodeGen/AMDGPU/divrem24-assume.ll
+++ b/llvm/test/CodeGen/AMDGPU/divrem24-assume.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -amdgpu-codegenprepare %s | FileCheck %s
 
-define amdgpu_kernel void @divrem24_assume(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @divrem24_assume(ptr addrspace(1) %arg, i32 %arg1) {
 ; CHECK-LABEL: @divrem24_assume(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
@@ -22,8 +22,8 @@ define amdgpu_kernel void @divrem24_assume(i32 addrspace(1)* %arg, i32 %arg1) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP7]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = and i32 [[TMP12]], 1023
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    store i32 0, i32 addrspace(1)* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -32,8 +32,8 @@ bb:
   tail call void @llvm.assume(i1 %tmp2)
   %tmp3 = udiv i32 %tmp, %arg1
   %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
-  store i32 0, i32 addrspace(1)* %tmp5, align 4
+  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp4
+  store i32 0, ptr addrspace(1) %tmp5, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
index 5997e27fd815e..5534cb0b2a517 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
@@ -23,7 +23,7 @@ declare void @llvm.amdgcn.s.barrier() #1
 ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34
 ; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
 ; CHECK: s_endpgm
-define amdgpu_kernel void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
+define amdgpu_kernel void @signed_ds_offset_addressing_loop(ptr addrspace(1) noalias nocapture %out, ptr addrspace(3) noalias nocapture readonly %lptr, i32 %n) #2 {
 entry:
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %mul = shl nsw i32 %x.i, 1
@@ -34,20 +34,20 @@ for.body:                                         ; preds = %for.body, %entry
   %offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ]
   %k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
   tail call void @llvm.amdgcn.s.barrier() #1
-  %arrayidx = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %offset.02
-  %tmp = load float, float addrspace(3)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %lptr, i32 %offset.02
+  %tmp = load float, ptr addrspace(3) %arrayidx, align 4
   %add1 = add nsw i32 %offset.02, 2
-  %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add1
-  %tmp1 = load float, float addrspace(3)* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(3) %lptr, i32 %add1
+  %tmp1 = load float, ptr addrspace(3) %arrayidx2, align 4
   %add3 = add nsw i32 %offset.02, 32
-  %arrayidx4 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add3
-  %tmp2 = load float, float addrspace(3)* %arrayidx4, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr addrspace(3) %lptr, i32 %add3
+  %tmp2 = load float, ptr addrspace(3) %arrayidx4, align 4
   %add5 = add nsw i32 %offset.02, 34
-  %arrayidx6 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add5
-  %tmp3 = load float, float addrspace(3)* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds float, ptr addrspace(3) %lptr, i32 %add5
+  %tmp3 = load float, ptr addrspace(3) %arrayidx6, align 4
   %add7 = add nsw i32 %offset.02, 64
-  %arrayidx8 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add7
-  %tmp4 = load float, float addrspace(3)* %arrayidx8, align 4
+  %arrayidx8 = getelementptr inbounds float, ptr addrspace(3) %lptr, i32 %add7
+  %tmp4 = load float, ptr addrspace(3) %arrayidx8, align 4
   %add9 = fadd float %tmp, %tmp1
   %add10 = fadd float %add9, %tmp2
   %add11 = fadd float %add10, %tmp3
@@ -60,8 +60,8 @@ for.body:                                         ; preds = %for.body, %entry
 
 for.end:                                          ; preds = %for.body
   %tmp5 = sext i32 %x.i to i64
-  %arrayidx15 = getelementptr inbounds float, float addrspace(1)* %out, i64 %tmp5
-  store float %add13, float addrspace(1)* %arrayidx15, align 4
+  %arrayidx15 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tmp5
+  store float %add13, ptr addrspace(1) %arrayidx15, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 5ff781bc8be00..1fe719984d174 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -44,9 +44,9 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
   %sub1 = sub i32 0, %x.i
-  %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
-  store i32 123, i32 addrspace(3)* %arrayidx
+  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
+  store i32 123, ptr addrspace(3) %arrayidx
   ret void
 }
 
@@ -123,11 +123,11 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
   %sub1 = sub i32 0, %x.i
-  %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
-  store i32 123, i32 addrspace(3)* %arrayidx
+  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
+  store i32 123, ptr addrspace(3) %arrayidx
   %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
-  store volatile float %fmas, float addrspace(1)* null
+  store volatile float %fmas, ptr addrspace(1) null
   ret void
 }
 
@@ -203,11 +203,11 @@ define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy
 ; GFX11-NEXT:    s_endpgm
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
   %sub1 = sub i32 -1, %x.i
-  %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 16383
-  store i32 123, i32 addrspace(3)* %arrayidx
+  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 16383
+  store i32 123, ptr addrspace(3) %arrayidx
   %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
-  store volatile float %fmas, float addrspace(1)* null
+  store volatile float %fmas, ptr addrspace(1) null
   ret void
 }
 
@@ -243,8 +243,8 @@ define amdgpu_kernel void @add_x_shl_max_offset() #1 {
   %shl = shl i32 %x.i, 4
   %add = add i32 %shl, 65535
   %z = zext i32 %add to i64
-  %ptr = inttoptr i64 %z to i8 addrspace(3)*
-  store i8 13, i8 addrspace(3)* %ptr, align 1
+  %ptr = inttoptr i64 %z to ptr addrspace(3)
+  store i8 13, ptr addrspace(3) %ptr, align 1
   ret void
 }
 
@@ -287,8 +287,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
   %.neg = mul i32 %x.i, -4
   %add = add i32 %.neg, 65535
   %z = zext i32 %add to i64
-  %ptr = inttoptr i64 %z to i8 addrspace(3)*
-  store i8 13, i8 addrspace(3)* %ptr, align 1
+  %ptr = inttoptr i64 %z to ptr addrspace(3)
+  store i8 13, ptr addrspace(3) %ptr, align 1
   ret void
 }
 
@@ -331,8 +331,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
   %add = add i32 65535, %shl
-  %ptr = inttoptr i32 %add to i8 addrspace(3)*
-  store i8 13, i8 addrspace(3)* %ptr
+  %ptr = inttoptr i32 %add to ptr addrspace(3)
+  store i8 13, ptr addrspace(3) %ptr
   ret void
 }
 
@@ -373,8 +373,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
   %add = add i32 65536, %shl
-  %ptr = inttoptr i32 %add to i8 addrspace(3)*
-  store i8 13, i8 addrspace(3)* %ptr
+  %ptr = inttoptr i32 %add to ptr addrspace(3)
+  store i8 13, ptr addrspace(3) %ptr
   ret void
 }
 
@@ -420,10 +420,10 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
   %shl = shl i32 %neg, 2
   %add0 = add i32 123, %shl
   %add1 = add i32 456, %shl
-  %ptr0 = inttoptr i32 %add0 to i32 addrspace(3)*
-  store volatile i32 13, i32 addrspace(3)* %ptr0
-  %ptr1 = inttoptr i32 %add1 to i32 addrspace(3)*
-  store volatile i32 13, i32 addrspace(3)* %ptr1
+  %ptr0 = inttoptr i32 %add0 to ptr addrspace(3)
+  store volatile i32 13, ptr addrspace(3) %ptr0
+  %ptr1 = inttoptr i32 %add1 to ptr addrspace(3)
+  store volatile i32 13, ptr addrspace(3) %ptr1
   ret void
 }
 
@@ -468,9 +468,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
   %add = add i32 123, %shl
-  %ptr = inttoptr i32 %add to i32 addrspace(3)*
-  store volatile i32 13, i32 addrspace(3)* %ptr
-  store volatile i32 13, i32 addrspace(3)* %ptr
+  %ptr = inttoptr i32 %add to ptr addrspace(3)
+  store volatile i32 13, ptr addrspace(3) %ptr
+  store volatile i32 13, ptr addrspace(3) %ptr
   ret void
 }
 
@@ -516,8 +516,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
   %add = add i32 1019, %shl
-  %ptr = inttoptr i32 %add to i64 addrspace(3)*
-  store i64 123, i64 addrspace(3)* %ptr, align 4
+  %ptr = inttoptr i32 %add to ptr addrspace(3)
+  store i64 123, ptr addrspace(3) %ptr, align 4
   ret void
 }
 
@@ -600,10 +600,10 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
   %add = add i32 1019, %shl
-  %ptr = inttoptr i32 %add to i64 addrspace(3)*
-  store i64 123, i64 addrspace(3)* %ptr, align 4
+  %ptr = inttoptr i32 %add to ptr addrspace(3)
+  store i64 123, ptr addrspace(3) %ptr, align 4
   %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
-  store volatile float %fmas, float addrspace(1)* null
+  store volatile float %fmas, ptr addrspace(1) null
   ret void
 }
 
@@ -649,8 +649,8 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
   %add = add i32 1020, %shl
-  %ptr = inttoptr i32 %add to i64 addrspace(3)*
-  store i64 123, i64 addrspace(3)* %ptr, align 4
+  %ptr = inttoptr i32 %add to ptr addrspace(3)
+  store i64 123, ptr addrspace(3) %ptr, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
index fc75ef69be032..302b351e229d5 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
@@ -5,103 +5,103 @@
 ; GCN-LABEL: {{^}}ds1align1:
 ; GCN-COUNT-2: ds_read_u8
 ; GCN-COUNT-2: ds_write_b8
-define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
-  %val1 = load i8, i8 addrspace(3)* %in, align 1
-  %gep1 = getelementptr i8, i8 addrspace(3)* %in, i32 1
-  %val2 = load i8, i8 addrspace(3)* %gep1, align 1
-  store i8 %val1, i8 addrspace(3)* %out, align 1
-  %gep2 = getelementptr i8, i8 addrspace(3)* %out, i32 1
-  store i8 %val2, i8 addrspace(3)* %gep2, align 1
+define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
+  %val1 = load i8, ptr addrspace(3) %in, align 1
+  %gep1 = getelementptr i8, ptr addrspace(3) %in, i32 1
+  %val2 = load i8, ptr addrspace(3) %gep1, align 1
+  store i8 %val1, ptr addrspace(3) %out, align 1
+  %gep2 = getelementptr i8, ptr addrspace(3) %out, i32 1
+  store i8 %val2, ptr addrspace(3) %gep2, align 1
   ret void
 }
 
 ; GCN-LABEL: {{^}}ds2align2:
 ; GCN-COUNT-2: ds_read_u16
 ; GCN-COUNT-2: ds_write_b16
-define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
-  %val1 = load i16, i16 addrspace(3)* %in, align 2
-  %gep1 = getelementptr i16, i16 addrspace(3)* %in, i32 1
-  %val2 = load i16, i16 addrspace(3)* %gep1, align 2
-  store i16 %val1, i16 addrspace(3)* %out, align 2
-  %gep2 = getelementptr i16, i16 addrspace(3)* %out, i32 1
-  store i16 %val2, i16 addrspace(3)* %gep2, align 2
+define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
+  %val1 = load i16, ptr addrspace(3) %in, align 2
+  %gep1 = getelementptr i16, ptr addrspace(3) %in, i32 1
+  %val2 = load i16, ptr addrspace(3) %gep1, align 2
+  store i16 %val1, ptr addrspace(3) %out, align 2
+  %gep2 = getelementptr i16, ptr addrspace(3) %out, i32 1
+  store i16 %val2, ptr addrspace(3) %gep2, align 2
   ret void
 }
 
 ; GCN-LABEL: {{^}}ds4align4:
 ; GCN: ds_read2_b32
 ; GCN: ds_write2_b32
-define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
-  %val1 = load i32, i32 addrspace(3)* %in, align 4
-  %gep1 = getelementptr i32, i32 addrspace(3)* %in, i32 1
-  %val2 = load i32, i32 addrspace(3)* %gep1, align 4
-  store i32 %val1, i32 addrspace(3)* %out, align 4
-  %gep2 = getelementptr i32, i32 addrspace(3)* %out, i32 1
-  store i32 %val2, i32 addrspace(3)* %gep2, align 4
+define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
+  %val1 = load i32, ptr addrspace(3) %in, align 4
+  %gep1 = getelementptr i32, ptr addrspace(3) %in, i32 1
+  %val2 = load i32, ptr addrspace(3) %gep1, align 4
+  store i32 %val1, ptr addrspace(3) %out, align 4
+  %gep2 = getelementptr i32, ptr addrspace(3) %out, i32 1
+  store i32 %val2, ptr addrspace(3) %gep2, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}ds8align8:
 ; GCN: ds_read2_b64
 ; GCN: ds_write2_b64
-define amdgpu_kernel void @ds8align8(i64 addrspace(3)* %in, i64 addrspace(3)* %out) {
-  %val1 = load i64, i64 addrspace(3)* %in, align 8
-  %gep1 = getelementptr i64, i64 addrspace(3)* %in, i64 1
-  %val2 = load i64, i64 addrspace(3)* %gep1, align 8
-  store i64 %val1, i64 addrspace(3)* %out, align 8
-  %gep2 = getelementptr i64, i64 addrspace(3)* %out, i64 1
-  store i64 %val2, i64 addrspace(3)* %gep2, align 8
+define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
+  %val1 = load i64, ptr addrspace(3) %in, align 8
+  %gep1 = getelementptr i64, ptr addrspace(3) %in, i64 1
+  %val2 = load i64, ptr addrspace(3) %gep1, align 8
+  store i64 %val1, ptr addrspace(3) %out, align 8
+  %gep2 = getelementptr i64, ptr addrspace(3) %out, i64 1
+  store i64 %val2, ptr addrspace(3) %gep2, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}ds1align2:
 ; GCN: ds_read_u16
 ; GCN: ds_write_b16
-define amdgpu_kernel void @ds1align2(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
-  %val1 = load i8, i8 addrspace(3)* %in, align 2
-  %gep1 = getelementptr i8, i8 addrspace(3)* %in, i32 1
-  %val2 = load i8, i8 addrspace(3)* %gep1, align 2
-  store i8 %val1, i8 addrspace(3)* %out, align 2
-  %gep2 = getelementptr i8, i8 addrspace(3)* %out, i32 1
-  store i8 %val2, i8 addrspace(3)* %gep2, align 2
+define amdgpu_kernel void @ds1align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
+  %val1 = load i8, ptr addrspace(3) %in, align 2
+  %gep1 = getelementptr i8, ptr addrspace(3) %in, i32 1
+  %val2 = load i8, ptr addrspace(3) %gep1, align 2
+  store i8 %val1, ptr addrspace(3) %out, align 2
+  %gep2 = getelementptr i8, ptr addrspace(3) %out, i32 1
+  store i8 %val2, ptr addrspace(3) %gep2, align 2
   ret void
 }
 
 ; GCN-LABEL: {{^}}ds2align4:
 ; GCN: ds_read_b32
 ; GCN: ds_write_b32
-define amdgpu_kernel void @ds2align4(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
-  %val1 = load i16, i16 addrspace(3)* %in, align 4
-  %gep1 = getelementptr i16, i16 addrspace(3)* %in, i32 1
-  %val2 = load i16, i16 addrspace(3)* %gep1, align 4
-  store i16 %val1, i16 addrspace(3)* %out, align 4
-  %gep2 = getelementptr i16, i16 addrspace(3)* %out, i32 1
-  store i16 %val2, i16 addrspace(3)* %gep2, align 4
+define amdgpu_kernel void @ds2align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
+  %val1 = load i16, ptr addrspace(3) %in, align 4
+  %gep1 = getelementptr i16, ptr addrspace(3) %in, i32 1
+  %val2 = load i16, ptr addrspace(3) %gep1, align 4
+  store i16 %val1, ptr addrspace(3) %out, align 4
+  %gep2 = getelementptr i16, ptr addrspace(3) %out, i32 1
+  store i16 %val2, ptr addrspace(3) %gep2, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}ds4align8:
 ; GCN: ds_read_b64
 ; GCN: ds_write_b64
-define amdgpu_kernel void @ds4align8(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
-  %val1 = load i32, i32 addrspace(3)* %in, align 8
-  %gep1 = getelementptr i32, i32 addrspace(3)* %in, i32 1
-  %val2 = load i32, i32 addrspace(3)* %gep1, align 8
-  store i32 %val1, i32 addrspace(3)* %out, align 8
-  %gep2 = getelementptr i32, i32 addrspace(3)* %out, i32 1
-  store i32 %val2, i32 addrspace(3)* %gep2, align 8
+define amdgpu_kernel void @ds4align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
+  %val1 = load i32, ptr addrspace(3) %in, align 8
+  %gep1 = getelementptr i32, ptr addrspace(3) %in, i32 1
+  %val2 = load i32, ptr addrspace(3) %gep1, align 8
+  store i32 %val1, ptr addrspace(3) %out, align 8
+  %gep2 = getelementptr i32, ptr addrspace(3) %out, i32 1
+  store i32 %val2, ptr addrspace(3) %gep2, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}ds8align16:
 ; GCN: ds_read_b128
 ; GCN: ds_write_b128
-define amdgpu_kernel void @ds8align16(i64 addrspace(3)* %in, i64 addrspace(3)* %out) {
-  %val1 = load i64, i64 addrspace(3)* %in, align 16
-  %gep1 = getelementptr i64, i64 addrspace(3)* %in, i64 1
-  %val2 = load i64, i64 addrspace(3)* %gep1, align 16
-  store i64 %val1, i64 addrspace(3)* %out, align 16
-  %gep2 = getelementptr i64, i64 addrspace(3)* %out, i64 1
-  store i64 %val2, i64 addrspace(3)* %gep2, align 16
+define amdgpu_kernel void @ds8align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
+  %val1 = load i64, ptr addrspace(3) %in, align 16
+  %gep1 = getelementptr i64, ptr addrspace(3) %in, i64 1
+  %val2 = load i64, ptr addrspace(3) %gep1, align 16
+  store i64 %val1, ptr addrspace(3) %out, align 16
+  %gep2 = getelementptr i64, ptr addrspace(3) %out, i64 1
+  store i64 %val2, ptr addrspace(3) %gep2, align 16
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 4a7a328d33c61..8fd43c5811084 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -9,7 +9,7 @@
 @lds = addrspace(3) global [512 x float] undef, align 4
 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
 
-define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 {
 ; CI-LABEL: simple_read2_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
@@ -34,18 +34,18 @@ define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 {
 ; CI-LABEL: simple_read2_f32_max_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
@@ -70,18 +70,18 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 255
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 {
 ; CI-LABEL: simple_read2_f32_too_far:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
@@ -108,18 +108,18 @@ define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 257
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 {
 ; CI-LABEL: simple_read2_f32_x2:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
@@ -151,31 +151,31 @@ define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
 ; GFX9-NEXT:    s_endpgm
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 0
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
 
   %idx.1 = add nsw i32 %tid.x, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum.0 = fadd float %val0, %val1
 
   %idx.2 = add nsw i32 %tid.x, 11
-  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
-  %val2 = load float, float addrspace(3)* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
+  %val2 = load float, ptr addrspace(3) %arrayidx2, align 4
 
   %idx.3 = add nsw i32 %tid.x, 27
-  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
-  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
+  %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
+  %val3 = load float, ptr addrspace(3) %arrayidx3, align 4
   %sum.1 = fadd float %val2, %val3
 
   %sum = fadd float %sum.0, %sum.1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %idx.0
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
 ; Make sure there is an instruction between the two sets of reads.
-define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 {
 ; CI-LABEL: simple_read2_f32_x2_barrier:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
@@ -211,34 +211,34 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out)
 ; GFX9-NEXT:    s_endpgm
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 0
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
 
   %idx.1 = add nsw i32 %tid.x, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum.0 = fadd float %val0, %val1
 
   call void @llvm.amdgcn.s.barrier() #2
 
   %idx.2 = add nsw i32 %tid.x, 11
-  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
-  %val2 = load float, float addrspace(3)* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
+  %val2 = load float, ptr addrspace(3) %arrayidx2, align 4
 
   %idx.3 = add nsw i32 %tid.x, 27
-  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
-  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
+  %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
+  %val3 = load float, ptr addrspace(3) %arrayidx3, align 4
   %sum.1 = fadd float %val2, %val3
 
   %sum = fadd float %sum.0, %sum.1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %idx.0
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
 ; For some reason adding something to the base address for the first
 ; element results in only folding the inner pair.
-define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %out) #0 {
 ; CI-LABEL: simple_read2_f32_x2_nonzero_base:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
@@ -270,26 +270,26 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)*
 ; GFX9-NEXT:    s_endpgm
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
 
   %idx.1 = add nsw i32 %tid.x, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum.0 = fadd float %val0, %val1
 
   %idx.2 = add nsw i32 %tid.x, 11
-  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
-  %val2 = load float, float addrspace(3)* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
+  %val2 = load float, ptr addrspace(3) %arrayidx2, align 4
 
   %idx.3 = add nsw i32 %tid.x, 27
-  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
-  %val3 = load float, float addrspace(3)* %arrayidx3, align 4
+  %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
+  %val3 = load float, ptr addrspace(3) %arrayidx3, align 4
   %sum.1 = fadd float %val2, %val3
 
   %sum = fadd float %sum.0, %sum.1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %idx.0
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -298,7 +298,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)*
 ; merge.
 ; Base pointers come from different subregister of same super
 ; register. We can't safely merge this.
-define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 {
 ; CI-LABEL: read2_ptr_is_subreg_arg_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
@@ -333,15 +333,15 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out,
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
-  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
-  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
-  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
-  %val0 = load float, float addrspace(3)* %gep.0, align 4
-  %val1 = load float, float addrspace(3)* %gep.1, align 4
+  %gep = getelementptr inbounds float, <2 x ptr addrspace(3)> %lds.ptr, <2 x i32> %index.1
+  %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
+  %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
+  %val0 = load float, ptr addrspace(3) %gep.0, align 4
+  %val1 = load float, ptr addrspace(3) %gep.1, align 4
   %add.x = add nsw i32 %x.i, 8
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -349,7 +349,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out,
 ; are rejecting merges that have the same, constant 0 offset, so make
 ; sure we are really rejecting it because of the different
 ; subregisters.
-define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 {
 ; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
@@ -384,23 +384,23 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
-  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
-  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
-  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+  %gep = getelementptr inbounds float, <2 x ptr addrspace(3)> %lds.ptr, <2 x i32> %index.1
+  %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
+  %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
 
   ; Apply an additional offset after the vector that will be more obviously folded.
-  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
+  %gep.1.offset = getelementptr float, ptr addrspace(3) %gep.1, i32 8
 
-  %val0 = load float, float addrspace(3)* %gep.0, align 4
-  %val1 = load float, float addrspace(3)* %gep.1.offset, align 4
+  %val0 = load float, ptr addrspace(3) %gep.0, align 4
+  %val1 = load float, ptr addrspace(3) %gep.1.offset, align 4
   %add.x = add nsw i32 %x.i, 8
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 {
 ; CI-LABEL: read2_ptr_is_subreg_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
@@ -425,24 +425,24 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
-  %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
+  %ptr.0 = insertelement <2 x ptr addrspace(3)> undef, ptr addrspace(3) @lds, i32 0
+  %ptr.1 = insertelement <2 x ptr addrspace(3)> %ptr.0, ptr addrspace(3) @lds, i32 1
   %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
   %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
-  %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
-  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
-  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
-  %val0 = load float, float addrspace(3)* %gep.0, align 4
-  %val1 = load float, float addrspace(3)* %gep.1, align 4
+  %gep = getelementptr inbounds [512 x float], <2 x ptr addrspace(3)> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
+  %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
+  %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
+  %val0 = load float, ptr addrspace(3) %gep.0, align 4
+  %val1 = load float, ptr addrspace(3) %gep.1, align 4
   %add.x = add nsw i32 %x.i, 8
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 {
 ; CI-LABEL: simple_read2_f32_volatile_0:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
@@ -469,18 +469,18 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  %val0 = load volatile float, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 {
 ; CI-LABEL: simple_read2_f32_volatile_1:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
@@ -507,19 +507,19 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  %val1 = load volatile float, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
 ; Can't fold since not correctly aligned.
-define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 ; CI-LABEL: unaligned_read2_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[0:1], 0x2
@@ -601,18 +601,18 @@ define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float a
 ; GFX9-UNALIGNED-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 1
+  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 1
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 1
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 1
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 ; CI-LABEL: unaligned_offset_read2_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[0:1], 0x2
@@ -694,21 +694,18 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out,
 ; GFX9-UNALIGNED-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %base = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
-  %base.i8 = bitcast float addrspace(3)* %base to i8 addrspace(3)*
-  %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5
-  %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to float addrspace(3)*
-  %val0 = load float, float addrspace(3)* %addr0, align 1
-  %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
-  %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to float addrspace(3)*
-  %val1 = load float, float addrspace(3)* %addr1, align 1
+  %base = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
+  %addr0.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 5
+  %val0 = load float, ptr addrspace(3) %addr0.i8, align 1
+  %addr1.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 9
+  %val1 = load float, ptr addrspace(3) %addr1.i8, align 1
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 ; CI-LABEL: misaligned_2_simple_read2_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[0:1], 0x2
@@ -767,18 +764,18 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %ou
 ; GFX9-UNALIGNED-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 2
+  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 2
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 2
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 2
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 {
 ; CI-LABEL: simple_read2_f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
@@ -803,18 +800,18 @@ define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
-  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
+  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
-  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
+  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
-  store double %sum, double addrspace(1)* %out.gep, align 8
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
+  store double %sum, ptr addrspace(1) %out.gep, align 8
   ret void
 }
 
-define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 {
 ; CI-LABEL: simple_read2_f64_max_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
@@ -839,18 +836,18 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
-  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
+  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 255
-  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
-  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
+  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
-  store double %sum, double addrspace(1)* %out.gep, align 8
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
+  store double %sum, ptr addrspace(1) %out.gep, align 8
   ret void
 }
 
-define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 {
 ; CI-LABEL: simple_read2_f64_too_far:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
@@ -877,19 +874,19 @@ define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
-  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
+  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 257
-  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
-  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
+  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
-  store double %sum, double addrspace(1)* %out.gep, align 8
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
+  store double %sum, ptr addrspace(1) %out.gep, align 8
   ret void
 }
 
 ; Alignment only 4
-define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 ; CI-LABEL: misaligned_read2_f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[0:1], 0x2
@@ -922,20 +919,20 @@ define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, doubl
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
-  %val0 = load double, double addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
+  %val0 = load double, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 7
-  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
-  %val1 = load double, double addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x
+  %val1 = load double, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
-  store double %sum, double addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
+  store double %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
 @foo = addrspace(3) global [4 x i32] undef, align 4
 
-define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) {
 ; CI-LABEL: load_constant_adjacent_offsets:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
@@ -958,14 +955,14 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-  %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
-  %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
+  %val0 = load i32, ptr addrspace(3) @foo, align 4
+  %val1 = load i32, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 1), align 4
   %sum = add i32 %val0, %val1
-  store i32 %sum, i32 addrspace(1)* %out, align 4
+  store i32 %sum, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) {
 ; CI-LABEL: load_constant_disjoint_offsets:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
@@ -988,16 +985,16 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
-  %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
-  %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
+  %val0 = load i32, ptr addrspace(3) @foo, align 4
+  %val1 = load i32, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 2), align 4
   %sum = add i32 %val0, %val1
-  store i32 %sum, i32 addrspace(1)* %out, align 4
+  store i32 %sum, ptr addrspace(1) %out, align 4
   ret void
 }
 
 @bar = addrspace(3) global [4 x i64] undef, align 4
 
-define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) %out) {
 ; CI-LABEL: load_misaligned64_constant_offsets:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
@@ -1022,16 +1019,16 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
-  %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
-  %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
+  %val0 = load i64, ptr addrspace(3) @bar, align 4
+  %val1 = load i64, ptr addrspace(3) getelementptr inbounds ([4 x i64], ptr addrspace(3) @bar, i32 0, i32 1), align 4
   %sum = add i64 %val0, %val1
-  store i64 %sum, i64 addrspace(1)* %out, align 8
+  store i64 %sum, ptr addrspace(1) %out, align 8
   ret void
 }
 
 @bar.large = addrspace(3) global [4096 x i64] undef, align 4
 
-define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspace(1) %out) {
 ; CI-LABEL: load_misaligned64_constant_large_offsets:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
@@ -1058,17 +1055,17 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspac
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
-  %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
-  %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
+  %val0 = load i64, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 2048), align 4
+  %val1 = load i64, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 4095), align 4
   %sum = add i64 %val0, %val1
-  store i64 %sum, i64 addrspace(1)* %out, align 8
+  store i64 %sum, ptr addrspace(1) %out, align 8
   ret void
 }
 
 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
 
-define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
+define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb) #0 {
 ; CI-LABEL: sgemm_inner_loop_read2_sequence:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
@@ -1131,34 +1128,34 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
   %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
-  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
-  %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4
+  %arrayidx44 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %x.i
+  %tmp16 = load float, ptr addrspace(3) %arrayidx44, align 4
   %add47 = add nsw i32 %x.i, 1
-  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
-  %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4
+  %arrayidx48 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add47
+  %tmp17 = load float, ptr addrspace(3) %arrayidx48, align 4
   %add51 = add nsw i32 %x.i, 16
-  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
-  %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4
+  %arrayidx52 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add51
+  %tmp18 = load float, ptr addrspace(3) %arrayidx52, align 4
   %add55 = add nsw i32 %x.i, 17
-  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
-  %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4
-  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
-  %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4
+  %arrayidx56 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add55
+  %tmp19 = load float, ptr addrspace(3) %arrayidx56, align 4
+  %arrayidx60 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %y.i
+  %tmp20 = load float, ptr addrspace(3) %arrayidx60, align 4
   %add63 = add nsw i32 %y.i, 1
-  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
-  %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4
+  %arrayidx64 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add63
+  %tmp21 = load float, ptr addrspace(3) %arrayidx64, align 4
   %add67 = add nsw i32 %y.i, 32
-  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
-  %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4
+  %arrayidx68 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add67
+  %tmp22 = load float, ptr addrspace(3) %arrayidx68, align 4
   %add71 = add nsw i32 %y.i, 33
-  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
-  %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4
+  %arrayidx72 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add71
+  %tmp23 = load float, ptr addrspace(3) %arrayidx72, align 4
   %add75 = add nsw i32 %y.i, 64
-  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
-  %tmp24 = load float, float addrspace(3)* %arrayidx76, align 4
+  %arrayidx76 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add75
+  %tmp24 = load float, ptr addrspace(3) %arrayidx76, align 4
   %add79 = add nsw i32 %y.i, 65
-  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
-  %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4
+  %arrayidx80 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add79
+  %tmp25 = load float, ptr addrspace(3) %arrayidx80, align 4
   %sum.0 = fadd float %tmp16, %tmp17
   %sum.1 = fadd float %sum.0, %tmp18
   %sum.2 = fadd float %sum.1, %tmp19
@@ -1168,11 +1165,11 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %
   %sum.6 = fadd float %sum.5, %tmp23
   %sum.7 = fadd float %sum.6, %tmp24
   %sum.8 = fadd float %sum.7, %tmp25
-  store float %sum.8, float addrspace(1)* %C, align 4
+  store float %sum.8, ptr addrspace(1) %C, align 4
   ret void
 }
 
-define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
 ; CI-LABEL: misaligned_read2_v2i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[0:1], 0x2
@@ -1198,12 +1195,12 @@ define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out,
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-  %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
-  store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
+  %load = load <2 x i32>, ptr addrspace(3) %in, align 4
+  store <2 x i32> %load, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
 ; CI-LABEL: misaligned_read2_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[0:1], 0x2
@@ -1229,8 +1226,8 @@ define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addr
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
-  %load = load i64, i64 addrspace(3)* %in, align 4
-  store i64 %load, i64 addrspace(1)* %out, align 8
+  %load = load i64, ptr addrspace(3) %in, align 4
+  store i64 %load, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -1294,44 +1291,44 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving(
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    global_store_dword v8, v0, s[0:1] offset:40
 ; GFX9-NEXT:    s_endpgm
-  float addrspace(1)* nocapture %arg,
-  [4 x [4 x float]] addrspace(3)* %arg1,
-  [4 x [4 x float]] addrspace(3)* %arg2,
-  [4 x [4 x float]] addrspace(3)* %arg3,
-  [4 x [4 x float]] addrspace(3)* %arg4) #1 {
+  ptr addrspace(1) nocapture %arg,
+  ptr addrspace(3) %arg1,
+  ptr addrspace(3) %arg2,
+  ptr addrspace(3) %arg3,
+  ptr addrspace(3) %arg4) #1 {
 bb:
-  %tmp = getelementptr float, float addrspace(1)* %arg, i64 10
+  %tmp = getelementptr float, ptr addrspace(1) %arg, i64 10
   %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
-  %tmp7 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 0
-  %tmp8 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 0, i32 %tmp5
-  %tmp9 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 0
-  %tmp10 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 0, i32 %tmp5
-  %tmp11 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 1
-  %tmp12 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 1, i32 %tmp5
-  %tmp13 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 1
-  %tmp14 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 1, i32 %tmp5
-  %tmp15 = load float, float addrspace(3)* %tmp7
-  %tmp16 = load float, float addrspace(3)* %tmp8
+  %tmp7 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg1, i32 0, i32 %tmp6, i32 0
+  %tmp8 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg2, i32 0, i32 0, i32 %tmp5
+  %tmp9 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg3, i32 0, i32 %tmp6, i32 0
+  %tmp10 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg4, i32 0, i32 0, i32 %tmp5
+  %tmp11 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg1, i32 0, i32 %tmp6, i32 1
+  %tmp12 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg2, i32 0, i32 1, i32 %tmp5
+  %tmp13 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg3, i32 0, i32 %tmp6, i32 1
+  %tmp14 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg4, i32 0, i32 1, i32 %tmp5
+  %tmp15 = load float, ptr addrspace(3) %tmp7
+  %tmp16 = load float, ptr addrspace(3) %tmp8
   %tmp17 = fmul float %tmp15, %tmp16
   %tmp18 = fadd float 2.000000e+00, %tmp17
-  %tmp19 = load float, float addrspace(3)* %tmp9
-  %tmp20 = load float, float addrspace(3)* %tmp10
+  %tmp19 = load float, ptr addrspace(3) %tmp9
+  %tmp20 = load float, ptr addrspace(3) %tmp10
   %tmp21 = fmul float %tmp19, %tmp20
   %tmp22 = fsub float %tmp18, %tmp21
-  %tmp23 = load float, float addrspace(3)* %tmp11
-  %tmp24 = load float, float addrspace(3)* %tmp12
+  %tmp23 = load float, ptr addrspace(3) %tmp11
+  %tmp24 = load float, ptr addrspace(3) %tmp12
   %tmp25 = fmul float %tmp23, %tmp24
   %tmp26 = fsub float %tmp22, %tmp25
-  %tmp27 = load float, float addrspace(3)* %tmp13
-  %tmp28 = load float, float addrspace(3)* %tmp14
+  %tmp27 = load float, ptr addrspace(3) %tmp13
+  %tmp28 = load float, ptr addrspace(3) %tmp14
   %tmp29 = fmul float %tmp27, %tmp28
   %tmp30 = fsub float %tmp26, %tmp29
-  store float %tmp30, float addrspace(1)* %tmp
+  store float %tmp30, ptr addrspace(1) %tmp
   ret void
 }
 
-define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspace(3)* %arg) {
+define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspace(3) %arg) {
 ; CI-LABEL: ds_read_call_read:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_getpc_b64 s[40:41]
@@ -1416,17 +1413,17 @@ define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspa
 ; GFX9-NEXT:    global_store_dword v40, v0, s[34:35]
 ; GFX9-NEXT:    s_endpgm
   %x = call i32 @llvm.amdgcn.workitem.id.x()
-  %arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x
-  %arrayidx1 = getelementptr i32, i32 addrspace(3)* %arrayidx0, i32 1
-  %v0 = load i32, i32 addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr i32, ptr addrspace(3) %arg, i32 %x
+  %arrayidx1 = getelementptr i32, ptr addrspace(3) %arrayidx0, i32 1
+  %v0 = load i32, ptr addrspace(3) %arrayidx0, align 4
   call void @void_func_void()
-  %v1 = load i32, i32 addrspace(3)* %arrayidx1, align 4
+  %v1 = load i32, ptr addrspace(3) %arrayidx1, align 4
   %r = add i32 %v0, %v1
-  store i32 %r, i32 addrspace(1)* %out, align 4
+  store i32 %r, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrspace(3)* %inptr) {
+define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, ptr addrspace(3) %inptr) {
 ; CI-LABEL: ds_read_interp_read:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_mov_b32 m0, -1
@@ -1449,10 +1446,10 @@ define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrsp
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX9-NEXT:    ; return to shader part epilog
-  %v0 = load float, float addrspace(3)* %inptr, align 4
+  %v0 = load float, ptr addrspace(3) %inptr, align 4
   %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims)
-  %ptr1 = getelementptr float, float addrspace(3)* %inptr, i32 4
-  %v1 = load float, float addrspace(3)* %ptr1, align 4
+  %ptr1 = getelementptr float, ptr addrspace(3) %inptr, i32 4
+  %v1 = load float, ptr addrspace(3) %ptr1, align 4
   %v1b = fadd float %v1, %intrp
   %r0 = insertelement <2 x float> undef, float %v0, i32 0
   %r1 = insertelement <2 x float> %r0, float %v1b, i32 1
@@ -1461,7 +1458,7 @@ define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrsp
 
 @v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
 
-define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) {
 ; CI-LABEL: read2_v2i32_align1_odd_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
@@ -1533,8 +1530,8 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)*
 ; GFX9-UNALIGNED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
 entry:
-  %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
-  store <2 x i32> %load, <2 x i32> addrspace(1)* %out
+  %load = load <2 x i32>, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @v2i32_align1, i32 65), align 1
+  store <2 x i32> %load, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index 567476ba2c24d..b44aab5d51fd0 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -10,34 +10,33 @@
 ; SI-DAG: ds_read_b64 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:8
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:6 offset1:248
-define amdgpu_kernel void @offset_order(float addrspace(1)* %out) {
+define amdgpu_kernel void @offset_order(ptr addrspace(1) %out) {
 entry:
-  %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0
-  %val0 = load float, float addrspace(3)* %ptr0
+  %val0 = load float, ptr addrspace(3) @lds
 
-  %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 256
-  %val1 = load float, float addrspace(3)* %ptr1
+  %ptr1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 256
+  %val1 = load float, ptr addrspace(3) %ptr1
   %add1 = fadd float %val0, %val1
 
-  %ptr2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 3
-  %val2 = load float, float addrspace(3)* %ptr2
+  %ptr2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 3
+  %val2 = load float, ptr addrspace(3) %ptr2
   %add2 = fadd float %add1, %val2
 
-  %ptr3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2
-  %val3 = load float, float addrspace(3)* %ptr3
+  %ptr3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 2
+  %val3 = load float, ptr addrspace(3) %ptr3
   %add3 = fadd float %add2, %val3
 
-  %ptr4 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12
-  %val4 = load float, float addrspace(3)* %ptr4
+  %ptr4 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 12
+  %val4 = load float, ptr addrspace(3) %ptr4
   %add4 = fadd float %add3, %val4
 
-  %ptr5 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 14
-  %val5 = load float, float addrspace(3)* %ptr5
+  %ptr5 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 14
+  %val5 = load float, ptr addrspace(3) %ptr5
   %add5 = fadd float %add4, %val5
 
-  %ptr6 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11
-  %val6 = load float, float addrspace(3)* %ptr6
+  %ptr6 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 11
+  %val6 = load float, ptr addrspace(3) %ptr6
   %add6 = fadd float %add5, %val6
-  store float %add6, float addrspace(1)* %out
+  store float %add6, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
index fef46fd55bb9e..165b5013c8b02 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -12,12 +12,12 @@
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dwordx2 [[RESULT]]
 ; CI: s_endpgm
-define amdgpu_kernel void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg_align4(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds  [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
-  %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4
-  %out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
-  store <2 x float> %val0, <2 x float> addrspace(1)* %out.gep
+  %arrayidx0 = getelementptr inbounds  [512 x <2 x float>], ptr addrspace(3) @lds.v2, i32 0, i32 %x.i
+  %val0 = load <2 x float>, ptr addrspace(3) %arrayidx0, align 4
+  %out.gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %out, i32 %x.i
+  store <2 x float> %val0, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -26,12 +26,12 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg_align4(<2 x float> addrsp
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dwordx2 [[RESULT]]
 ; CI: s_endpgm
-define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
-  %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0
-  %out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
-  store <2 x float> %val0, <2 x float> addrspace(1)* %out.gep
+  %arrayidx0 = getelementptr inbounds [512 x <2 x float>], ptr addrspace(3) @lds.v2, i32 0, i32 %x.i
+  %val0 = load <2 x float>, ptr addrspace(3) %arrayidx0
+  %out.gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %out, i32 %x.i
+  store <2 x float> %val0, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -43,10 +43,10 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)*
 ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD0]], v[[ADD1]]
 ; CI: buffer_store_dword v[[ADD2]]
 ; CI: s_endpgm
-define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
-  %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x <4 x float>], ptr addrspace(3) @lds.v4, i32 0, i32 %x.i
+  %val0 = load <4 x float>, ptr addrspace(3) %arrayidx0, align 4
   %elt0 = extractelement <4 x float> %val0, i32 0
   %elt1 = extractelement <4 x float> %val0, i32 1
   %elt2 = extractelement <4 x float> %val0, i32 2
@@ -56,8 +56,8 @@ define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)
   %add1 = fadd float %elt1, %elt3
   %add2 = fadd float %add0, %add1
 
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %add2, float addrspace(1)* %out.gep
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %add2, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -68,10 +68,10 @@ define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)
 ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[ADD0]], v[[REG_Y]]
 ; CI: buffer_store_dword v[[ADD1]]
 ; CI: s_endpgm
-define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i
-  %val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x <3 x float>], ptr addrspace(3) @lds.v3, i32 0, i32 %x.i
+  %val0 = load <3 x float>, ptr addrspace(3) %arrayidx0, align 4
   %elt0 = extractelement <3 x float> %val0, i32 0
   %elt1 = extractelement <3 x float> %val0, i32 1
   %elt2 = extractelement <3 x float> %val0, i32 2
@@ -79,8 +79,8 @@ define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)
   %add0 = fadd float %elt0, %elt2
   %add1 = fadd float %add0, %elt1
 
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %add1, float addrspace(1)* %out.gep
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %add1, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -88,12 +88,12 @@ define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)
 ; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
 ; CI: buffer_store_dwordx4 [[REG_ZW]]
 ; CI: s_endpgm
-define amdgpu_kernel void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_align8(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
-  %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8
-  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
-  store <4 x float> %val0, <4 x float> addrspace(1)* %out.gep
+  %arrayidx0 = getelementptr inbounds [512 x <4 x float>], ptr addrspace(3) @lds.v4, i32 0, i32 %x.i
+  %val0 = load <4 x float>, ptr addrspace(3) %arrayidx0, align 8
+  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i32 %x.i
+  store <4 x float> %val0, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -101,12 +101,12 @@ define amdgpu_kernel void @simple_read2_v4f32_superreg_align8(<4 x float> addrsp
 ; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
 ; CI: buffer_store_dwordx4 [[REG_ZW]]
 ; CI: s_endpgm
-define amdgpu_kernel void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
-  %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0
-  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
-  store <4 x float> %val0, <4 x float> addrspace(1)* %out.gep
+  %arrayidx0 = getelementptr inbounds [512 x <4 x float>], ptr addrspace(3) @lds.v4, i32 0, i32 %x.i
+  %val0 = load <4 x float>, ptr addrspace(3) %arrayidx0
+  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i32 %x.i
+  store <4 x float> %val0, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -117,12 +117,12 @@ define amdgpu_kernel void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)*
 ; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
 ; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
 ; CI: s_endpgm
-define amdgpu_kernel void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v8f32_superreg(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i
-  %val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0
-  %out.gep = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %out, i32 %x.i
-  store <8 x float> %val0, <8 x float> addrspace(1)* %out.gep
+  %arrayidx0 = getelementptr inbounds [512 x <8 x float>], ptr addrspace(3) @lds.v8, i32 0, i32 %x.i
+  %val0 = load <8 x float>, ptr addrspace(3) %arrayidx0
+  %out.gep = getelementptr inbounds <8 x float>, ptr addrspace(1) %out, i32 %x.i
+  store <8 x float> %val0, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -138,12 +138,12 @@ define amdgpu_kernel void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)*
 ; CI-DAG: buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
 ; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
 ; CI: s_endpgm
-define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v16f32_superreg(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i
-  %val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0
-  %out.gep = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %out, i32 %x.i
-  store <16 x float> %val0, <16 x float> addrspace(1)* %out.gep
+  %arrayidx0 = getelementptr inbounds [512 x <16 x float>], ptr addrspace(3) @lds.v16, i32 0, i32 %x.i
+  %val0 = load <16 x float>, ptr addrspace(3) %arrayidx0
+  %out.gep = getelementptr inbounds <16 x float>, ptr addrspace(1) %out, i32 %x.i
+  store <16 x float> %val0, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -153,19 +153,19 @@ define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1
 ; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}}
 ; CI: buffer_store_dwordx2 v[[[REG_ELT0]]:[[REG_ELT1]]]
 ; CI: s_endpgm
-define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %arrayidx0, i32 1
 
-  %val0 = load float, float addrspace(3)* %arrayidx0
-  %val1 = load float, float addrspace(3)* %arrayidx1
+  %val0 = load float, ptr addrspace(3) %arrayidx0
+  %val1 = load float, ptr addrspace(3) %arrayidx1
 
   %vec.0 = insertelement <2 x float> undef, float %val0, i32 0
   %vec.1 = insertelement <2 x float> %vec.0, float %val1, i32 1
 
-  %out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
-  store <2 x float> %vec.1, <2 x float> addrspace(1)* %out.gep
+  %out.gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %out, i32 %x.i
+  store <2 x float> %vec.1, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -176,25 +176,25 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x
 ; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}}
 ; CI: buffer_store_dwordx4 v[[[REG_ELT0]]:[[REG_ELT3]]]
 ; CI: s_endpgm
-define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
-  %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 2
-  %arrayidx3 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 3
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %arrayidx0, i32 1
+  %arrayidx2 = getelementptr inbounds float, ptr addrspace(3) %arrayidx0, i32 2
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %arrayidx0, i32 3
 
-  %val0 = load float, float addrspace(3)* %arrayidx0
-  %val1 = load float, float addrspace(3)* %arrayidx1
-  %val2 = load float, float addrspace(3)* %arrayidx2
-  %val3 = load float, float addrspace(3)* %arrayidx3
+  %val0 = load float, ptr addrspace(3) %arrayidx0
+  %val1 = load float, ptr addrspace(3) %arrayidx1
+  %val2 = load float, ptr addrspace(3) %arrayidx2
+  %val3 = load float, ptr addrspace(3) %arrayidx3
 
   %vec.0 = insertelement <4 x float> undef, float %val0, i32 0
   %vec.1 = insertelement <4 x float> %vec.0, float %val1, i32 1
   %vec.2 = insertelement <4 x float> %vec.1, float %val2, i32 2
   %vec.3 = insertelement <4 x float> %vec.2, float %val3, i32 3
 
-  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
-  store <4 x float> %vec.3, <4 x float> addrspace(1)* %out.gep
+  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i32 %x.i
+  store <4 x float> %vec.3, ptr addrspace(1) %out.gep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
index e24864ef650d2..e330803811170 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -14,16 +14,16 @@
 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
 ; CI: buffer_store_dword [[RESULT]]
 ; GFX9: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_0_1(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 64
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -36,17 +36,17 @@ define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0
 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
 ; CI: buffer_store_dword [[RESULT]]
 ; GFX9: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_1_2(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x.0
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
   %add.x.1 = add nsw i32 %x.i, 128
-  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x.1
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -59,17 +59,17 @@ define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, fl
 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
 ; CI: buffer_store_dword [[RESULT]]
 ; GFX9: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x.0
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
   %add.x.1 = add nsw i32 %x.i, 16320
-  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x.1
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -82,17 +82,17 @@ define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %
 ; GCN-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
 ; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
 ; GCN: s_endpgm
-define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x.0
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
   %add.x.1 = add nsw i32 %x.i, 16384
-  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x.1
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -102,16 +102,16 @@ define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(
 
 ; GCN-NOT: ds_read2st64_b32
 ; GCN: s_endpgm
-define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @odd_invalid_read2st64_f32_0(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 63
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -121,17 +121,17 @@ define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out)
 
 ; GCN-NOT: ds_read2st64_b32
 ; GCN: s_endpgm
-define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @odd_invalid_read2st64_f32_1(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x.0
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
   %add.x.1 = add nsw i32 %x.i, 127
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x.1
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -144,16 +144,16 @@ define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out)
 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]]
 ; CI: buffer_store_dwordx2 [[RESULT]]
 ; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_0_1(ptr addrspace(1) %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
-  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
+  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 64
-  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
-  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
+  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
-  store double %sum, double addrspace(1)* %out.gep, align 8
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
+  store double %sum, ptr addrspace(1) %out.gep, align 8
   ret void
 }
 
@@ -167,17 +167,17 @@ define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #
 
 ; CI: buffer_store_dwordx2 [[RESULT]]
 ; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_1_2(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.0
+  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
   %add.x.1 = add nsw i32 %x.i, 128
-  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.1
+  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
-  store double %sum, double addrspace(1)* %out.gep, align 8
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
+  store double %sum, ptr addrspace(1) %out.gep, align 8
   ret void
 }
 
@@ -190,16 +190,16 @@ define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, d
 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
 ; GCN: s_endpgm
-define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_read2st64_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
-  %val0 = load double, double addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
+  %val0 = load double, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 64
-  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
-  %val1 = load double, double addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x
+  %val1 = load double, ptr addrspace(3) %arrayidx1, align 4
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
-  store double %sum, double addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
+  store double %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -214,17 +214,17 @@ define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, d
 
 ; CI: buffer_store_dwordx2 [[RESULT]]
 ; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 256
-  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.0
+  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
   %add.x.1 = add nsw i32 %x.i, 8128
-  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.1
+  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
-  store double %sum, double addrspace(1)* %out.gep, align 8
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
+  store double %sum, ptr addrspace(1) %out.gep, align 8
   ret void
 }
 
@@ -237,17 +237,17 @@ define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)*
 ; GCN-DAG: v_add_{{i|u}}32_e32 [[BIGADD:v[0-9]+]], {{(vcc, )?}}0x10000, {{v[0-9]+}}
 ; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.0
+  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
   %add.x.1 = add nsw i32 %x.i, 8192
-  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.1
+  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
-  store double %sum, double addrspace(1)* %out.gep, align 8
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
+  store double %sum, ptr addrspace(1) %out.gep, align 8
   ret void
 }
 
@@ -257,17 +257,17 @@ define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace
 
 ; GCN-NOT: ds_read2st64_b64
 ; GCN: s_endpgm
-define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
-  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
-  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.0
+  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
   %add.x.1 = add nsw i32 %x.i, 8129
-  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
-  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.1
+  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
-  store double %sum, double addrspace(1)* %out.gep, align 8
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
+  store double %sum, ptr addrspace(1) %out.gep, align 8
   ret void
 }
 
@@ -281,16 +281,16 @@ define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)*
 ; GCN-NOT: ds_read2st_b64
 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
 ; GCN: s_endpgm
-define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
-  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
+  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
-  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x
+  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
   %sum = fadd double %val0, %val1
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
-  store double %sum, double addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
+  store double %sum, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index b95f648bfe50c..db3dc4870bc13 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -6,7 +6,7 @@
 @lds = addrspace(3) global [512 x float] undef, align 4
 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
 
-define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
 ; CI-LABEL: simple_write2_one_val_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
@@ -31,17 +31,17 @@ define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, flo
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v1 offset1:8
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
-  %val = load float, float addrspace(1)* %in.gep, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  store float %val, float addrspace(3)* %arrayidx0, align 4
+  %in.gep = getelementptr float, ptr addrspace(1) %in, i32 %x.i
+  %val = load float, ptr addrspace(1) %in.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  store float %val, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  store float %val, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  store float %val, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
 ; CI-LABEL: simple_write2_two_val_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
@@ -70,19 +70,19 @@ define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, flo
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
-  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
-  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %x.i
+  %in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
+  %val0 = load volatile float, ptr addrspace(1) %in.gep.0, align 4
+  %val1 = load volatile float, ptr addrspace(1) %in.gep.1, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  store float %val0, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; CI-LABEL: simple_write2_two_val_f32_volatile_0:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2
@@ -116,19 +116,19 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(
 ; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
-  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
-  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
-  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
+  %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
+  %in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
+  %val0 = load volatile float, ptr addrspace(1) %in0.gep, align 4
+  %val1 = load volatile float, ptr addrspace(1) %in1.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  store volatile float %val0, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; CI-LABEL: simple_write2_two_val_f32_volatile_1:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2
@@ -162,15 +162,15 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(
 ; GFX9-NEXT:    ds_write_b32 v0, v2 offset:32
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
-  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
-  %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
-  %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
+  %in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
+  %val0 = load volatile float, ptr addrspace(1) %in0.gep, align 4
+  %val1 = load volatile float, ptr addrspace(1) %in1.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  store float %val0, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  store volatile float %val1, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
@@ -179,7 +179,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(
 ;       This should be an s_mov_b32. The v_mov_b32 gets introduced by an
 ;       early legalization of the constant bus constraint on the v_lshl_add_u32,
 ;       and then SIFoldOperands folds in an unlucky order.
-define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
 ; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
@@ -212,21 +212,21 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v3 offset1:8
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
-  %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
-  %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
+  %in.gep.0 = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %x.i
+  %in.gep.1 = getelementptr <2 x float>, ptr addrspace(1) %in.gep.0, i32 1
+  %val0 = load volatile <2 x float>, ptr addrspace(1) %in.gep.0, align 8
+  %val1 = load volatile <2 x float>, ptr addrspace(1) %in.gep.1, align 8
   %val0.0 = extractelement <2 x float> %val0, i32 0
   %val1.1 = extractelement <2 x float> %val1, i32 1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  store float %val0.0, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  store float %val1.1, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
 ; CI-LABEL: simple_write2_two_val_subreg2_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
@@ -253,19 +253,19 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)*
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:8
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
-  %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
+  %in.gep = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %x.i
+  %val = load <2 x float>, ptr addrspace(1) %in.gep, align 8
   %val0 = extractelement <2 x float> %val, i32 0
   %val1 = extractelement <2 x float> %val, i32 1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  store float %val0, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
 ; CI-LABEL: simple_write2_two_val_subreg4_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
@@ -292,19 +292,19 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)*
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v4 offset1:8
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
-  %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
+  %in.gep = getelementptr <4 x float>, ptr addrspace(1) %in, i32 %x.i
+  %val = load <4 x float>, ptr addrspace(1) %in.gep, align 16
   %val0 = extractelement <4 x float> %val, i32 0
   %val1 = extractelement <4 x float> %val, i32 3
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  store float %val0, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
 ; CI-LABEL: simple_write2_two_val_max_offset_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
@@ -333,19 +333,19 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:255
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
-  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
-  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %x.i
+  %in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
+  %val0 = load volatile float, ptr addrspace(1) %in.gep.0, align 4
+  %val1 = load volatile float, ptr addrspace(1) %in.gep.1, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  store float %val0, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 255
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; CI-LABEL: simple_write2_two_val_too_far_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2
@@ -379,19 +379,19 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)*
 ; GFX9-NEXT:    ds_write_b32 v0, v2 offset:1028
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
-  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
-  %val0 = load float, float addrspace(1)* %in0.gep, align 4
-  %val1 = load float, float addrspace(1)* %in1.gep, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
+  %in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
+  %val0 = load float, ptr addrspace(1) %in0.gep, align 4
+  %val1 = load float, ptr addrspace(1) %in1.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  store float %val0, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 257
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; CI-LABEL: simple_write2_two_val_f32_x2:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2
@@ -423,31 +423,31 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C,
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
 ; GFX9-NEXT:    s_endpgm
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
-  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
-  %val0 = load float, float addrspace(1)* %in0.gep, align 4
-  %val1 = load float, float addrspace(1)* %in1.gep, align 4
+  %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %tid.x
+  %in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %tid.x
+  %val0 = load float, ptr addrspace(1) %in0.gep, align 4
+  %val1 = load float, ptr addrspace(1) %in1.gep, align 4
 
   %idx.0 = add nsw i32 %tid.x, 0
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
-  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
+  store float %val0, ptr addrspace(3) %arrayidx0, align 4
 
   %idx.1 = add nsw i32 %tid.x, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
 
   %idx.2 = add nsw i32 %tid.x, 11
-  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
-  store float %val0, float addrspace(3)* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
+  store float %val0, ptr addrspace(3) %arrayidx2, align 4
 
   %idx.3 = add nsw i32 %tid.x, 27
-  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
-  store float %val1, float addrspace(3)* %arrayidx3, align 4
+  %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
+  store float %val1, ptr addrspace(3) %arrayidx3, align 4
 
   ret void
 }
 
-define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; CI-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2
@@ -479,31 +479,31 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrs
 ; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
 ; GFX9-NEXT:    s_endpgm
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
-  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
-  %val0 = load float, float addrspace(1)* %in0.gep, align 4
-  %val1 = load float, float addrspace(1)* %in1.gep, align 4
+  %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %tid.x
+  %in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %tid.x
+  %val0 = load float, ptr addrspace(1) %in0.gep, align 4
+  %val1 = load float, ptr addrspace(1) %in1.gep, align 4
 
   %idx.0 = add nsw i32 %tid.x, 3
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
-  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
+  store float %val0, ptr addrspace(3) %arrayidx0, align 4
 
   %idx.1 = add nsw i32 %tid.x, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
 
   %idx.2 = add nsw i32 %tid.x, 11
-  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
-  store float %val0, float addrspace(3)* %arrayidx2, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
+  store float %val0, ptr addrspace(3) %arrayidx2, align 4
 
   %idx.3 = add nsw i32 %tid.x, 27
-  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
-  store float %val1, float addrspace(3)* %arrayidx3, align 4
+  %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
+  store float %val1, ptr addrspace(3) %arrayidx3, align 4
 
   ret void
 }
 
-define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x ptr addrspace(3)> %lds.ptr) #0 {
 ; CI-LABEL: write2_ptr_subreg_arg_two_val_f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2
@@ -543,27 +543,27 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)*
 ; GFX9-NEXT:    ds_write_b32 v3, v2 offset:32
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
-  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
-  %val0 = load float, float addrspace(1)* %in0.gep, align 4
-  %val1 = load float, float addrspace(1)* %in1.gep, align 4
+  %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i
+  %in1.gep = getelementptr float, ptr addrspace(1) %in1, i32 %x.i
+  %val0 = load float, ptr addrspace(1) %in0.gep, align 4
+  %val1 = load float, ptr addrspace(1) %in1.gep, align 4
 
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
-  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
-  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
-  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+  %gep = getelementptr inbounds float, <2 x ptr addrspace(3)> %lds.ptr, <2 x i32> %index.1
+  %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
+  %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
 
   ; Apply an additional offset after the vector that will be more obviously folded.
-  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
-  store float %val0, float addrspace(3)* %gep.0, align 4
+  %gep.1.offset = getelementptr float, ptr addrspace(3) %gep.1, i32 8
+  store float %val0, ptr addrspace(3) %gep.0, align 4
 
   %add.x = add nsw i32 %x.i, 8
-  store float %val1, float addrspace(3)* %gep.1.offset, align 4
+  store float %val1, ptr addrspace(3) %gep.1.offset, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
 ; CI-LABEL: simple_write2_one_val_f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
@@ -588,17 +588,17 @@ define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, do
 ; GFX9-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
-  %val = load double, double addrspace(1)* %in.gep, align 8
-  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
-  store double %val, double addrspace(3)* %arrayidx0, align 8
+  %in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
+  %val = load double, ptr addrspace(1) %in.gep, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
+  store double %val, ptr addrspace(3) %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
-  store double %val, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
+  store double %val, ptr addrspace(3) %arrayidx1, align 8
   ret void
 }
 
-define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
 ; CI-LABEL: misaligned_simple_write2_one_val_f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2
@@ -629,17 +629,17 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace
 ; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset0:14 offset1:15
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
-  %val = load double, double addrspace(1)* %in.gep, align 8
-  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
-  store double %val, double addrspace(3)* %arrayidx0, align 4
+  %in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
+  %val = load double, ptr addrspace(1) %in.gep, align 8
+  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
+  store double %val, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 7
-  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
-  store double %val, double addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x
+  store double %val, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
-define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
 ; CI-LABEL: unaligned_offset_simple_write2_one_val_f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2
@@ -713,20 +713,17 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double add
 ; GFX9-UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1] offset:9
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
-  %val = load double, double addrspace(1)* %in.gep, align 8
-  %base = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
-  %base.i8 = bitcast double addrspace(3)* %base to i8 addrspace(3)*
-  %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5
-  %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to double addrspace(3)*
-  store double %val, double addrspace(3)* %addr0, align 1
-  %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
-  %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to double addrspace(3)*
-  store double %val, double addrspace(3)* %addr1, align 1
+  %in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
+  %val = load double, ptr addrspace(1) %in.gep, align 8
+  %base = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
+  %addr0.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 5
+  store double %val, ptr addrspace(3) %addr0.i8, align 1
+  %addr1.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 9
+  store double %val, ptr addrspace(3) %addr1.i8, align 1
   ret void
 }
 
-define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
 ; CI-LABEL: simple_write2_two_val_f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
@@ -755,15 +752,15 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do
 ; GFX9-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
-  %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
-  %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
-  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
-  store double %val0, double addrspace(3)* %arrayidx0, align 8
+  %in.gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %x.i
+  %in.gep.1 = getelementptr double, ptr addrspace(1) %in.gep.0, i32 1
+  %val0 = load volatile double, ptr addrspace(1) %in.gep.0, align 8
+  %val1 = load volatile double, ptr addrspace(1) %in.gep.1, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
+  store double %val0, ptr addrspace(3) %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
-  store double %val1, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
+  store double %val1, ptr addrspace(3) %arrayidx1, align 8
   ret void
 }
 
@@ -786,8 +783,8 @@ define amdgpu_kernel void @store_constant_adjacent_offsets() {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX9-NEXT:    s_endpgm
-  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
-  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
+  store i32 123, ptr addrspace(3) @foo, align 4
+  store i32 123, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 1), align 4
   ret void
 }
 
@@ -806,8 +803,8 @@ define amdgpu_kernel void @store_constant_disjoint_offsets() {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    ds_write2_b32 v1, v0, v0 offset1:2
 ; GFX9-NEXT:    s_endpgm
-  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
-  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
+  store i32 123, ptr addrspace(3) @foo, align 4
+  store i32 123, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 2), align 4
   ret void
 }
 
@@ -832,8 +829,8 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    ds_write_b128 v1, v[0:3]
 ; GFX9-NEXT:    s_endpgm
-  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
-  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
+  store i64 123, ptr addrspace(3) @bar, align 4
+  store i64 123, ptr addrspace(3) getelementptr inbounds ([4 x i64], ptr addrspace(3) @bar, i32 0, i32 1), align 4
   ret void
 }
 
@@ -860,15 +857,15 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
 ; GFX9-NEXT:    ds_write_b64 v2, v[0:1] offset:16384
 ; GFX9-NEXT:    ds_write_b64 v2, v[0:1] offset:32760
 ; GFX9-NEXT:    s_endpgm
-  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
-  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
+  store i64 123, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 2048), align 4
+  store i64 123, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 4095), align 4
   ret void
 }
 
 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
 
-define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb, ptr addrspace(1) %in) #0 {
 ; CI-LABEL: write2_sgemm_sequence:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x4
@@ -913,39 +910,39 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld
 ; GFX9-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
   %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
-  %val = load float, float addrspace(1)* %in
-  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
-  store float %val, float addrspace(3)* %arrayidx44, align 4
+  %val = load float, ptr addrspace(1) %in
+  %arrayidx44 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %x.i
+  store float %val, ptr addrspace(3) %arrayidx44, align 4
   %add47 = add nsw i32 %x.i, 1
-  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
-  store float %val, float addrspace(3)* %arrayidx48, align 4
+  %arrayidx48 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add47
+  store float %val, ptr addrspace(3) %arrayidx48, align 4
   %add51 = add nsw i32 %x.i, 16
-  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
-  store float %val, float addrspace(3)* %arrayidx52, align 4
+  %arrayidx52 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add51
+  store float %val, ptr addrspace(3) %arrayidx52, align 4
   %add55 = add nsw i32 %x.i, 17
-  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
-  store float %val, float addrspace(3)* %arrayidx56, align 4
-  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
-  store float %val, float addrspace(3)* %arrayidx60, align 4
+  %arrayidx56 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add55
+  store float %val, ptr addrspace(3) %arrayidx56, align 4
+  %arrayidx60 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %y.i
+  store float %val, ptr addrspace(3) %arrayidx60, align 4
   %add63 = add nsw i32 %y.i, 1
-  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
-  store float %val, float addrspace(3)* %arrayidx64, align 4
+  %arrayidx64 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add63
+  store float %val, ptr addrspace(3) %arrayidx64, align 4
   %add67 = add nsw i32 %y.i, 32
-  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
-  store float %val, float addrspace(3)* %arrayidx68, align 4
+  %arrayidx68 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add67
+  store float %val, ptr addrspace(3) %arrayidx68, align 4
   %add71 = add nsw i32 %y.i, 33
-  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
-  store float %val, float addrspace(3)* %arrayidx72, align 4
+  %arrayidx72 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add71
+  store float %val, ptr addrspace(3) %arrayidx72, align 4
   %add75 = add nsw i32 %y.i, 64
-  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
-  store float %val, float addrspace(3)* %arrayidx76, align 4
+  %arrayidx76 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add75
+  store float %val, ptr addrspace(3) %arrayidx76, align 4
   %add79 = add nsw i32 %y.i, 65
-  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
-  store float %val, float addrspace(3)* %arrayidx80, align 4
+  %arrayidx80 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add79
+  store float %val, ptr addrspace(3) %arrayidx80, align 4
   ret void
 }
 
-define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) %out, ptr addrspace(1) %in) #0 {
 ; CI-LABEL: simple_write2_v4f32_superreg_align4:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2
@@ -996,10 +993,10 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrs
 ; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
-  %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
-  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
-  store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
+  %in.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %in
+  %val0 = load <4 x float>, ptr addrspace(1) %in.gep, align 4
+  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(3) %out, i32 %x.i
+  store <4 x float> %val0, ptr addrspace(3) %out.gep, align 4
   ret void
 }
 
@@ -1047,7 +1044,7 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
 ; GFX9-UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1] offset:65
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
 entry:
-  store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
+  store <2 x i32> <i32 123, i32 456>, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @v2i32_align1, i32 65), align 1
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
index 2b78d04c3c1a2..ab2d4a9db603a 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -11,15 +11,15 @@
 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; GCN: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
 ; GCN: s_endpgm
-define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
-  %val = load float, float addrspace(1)* %in.gep, align 4
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  store float %val, float addrspace(3)* %arrayidx0, align 4
+  %in.gep = getelementptr float, ptr addrspace(1) %in, i32 %x.i
+  %val = load float, ptr addrspace(1) %in.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  store float %val, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 64
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  store float %val, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  store float %val, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
@@ -37,18 +37,18 @@ define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)*
 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; GCN: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
 ; GCN: s_endpgm
-define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
-  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
-  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
+  %in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %x.i
+  %in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
+  %val0 = load volatile float, ptr addrspace(1) %in.gep.0, align 4
+  %val1 = load volatile float, ptr addrspace(1) %in.gep.1, align 4
   %add.x.0 = add nsw i32 %x.i, 128
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
-  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x.0
+  store float %val0, ptr addrspace(3) %arrayidx0, align 4
   %add.x.1 = add nsw i32 %x.i, 320
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x.1
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
@@ -66,17 +66,17 @@ define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)*
 ; GCN-DAG: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
 ; GCN: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; GCN: s_endpgm
-define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
-  %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
-  %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
-  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
-  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %x.i
+  %in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
+  %val0 = load volatile float, ptr addrspace(1) %in.gep.0, align 4
+  %val1 = load volatile float, ptr addrspace(1) %in.gep.1, align 4
+  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
+  store float %val0, ptr addrspace(3) %arrayidx0, align 4
   %add.x = add nsw i32 %x.i, 16320
-  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
@@ -94,18 +94,18 @@ define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrsp
 ; GCN-DAG: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
 ; GCN: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
 ; GCN: s_endpgm
-define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
-  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
-  %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
-  %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
+  %in.gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %x.i
+  %in.gep.1 = getelementptr double, ptr addrspace(1) %in.gep.0, i32 1
+  %val0 = load volatile double, ptr addrspace(1) %in.gep.0, align 8
+  %val1 = load volatile double, ptr addrspace(1) %in.gep.1, align 8
   %add.x.0 = add nsw i32 %x.i, 256
-  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
-  store double %val0, double addrspace(3)* %arrayidx0, align 8
+  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.0
+  store double %val0, ptr addrspace(3) %arrayidx0, align 8
   %add.x.1 = add nsw i32 %x.i, 8128
-  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
-  store double %val1, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x.1
+  store double %val1, ptr addrspace(3) %arrayidx1, align 8
   ret void
 }
 
@@ -116,15 +116,15 @@ define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrs
 ; GCN-NOT: ds_write2st64_b64
 ; GCN: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
 ; GCN: s_endpgm
-define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
-  %val = load double, double addrspace(1)* %in.gep, align 8
-  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
-  store double %val, double addrspace(3)* %arrayidx0, align 8
+  %in.gep = getelementptr double, ptr addrspace(1) %in, i32 %x.i
+  %val = load double, ptr addrspace(1) %in.gep, align 8
+  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
+  store double %val, ptr addrspace(3) %arrayidx0, align 8
   %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
-  store double %val, double addrspace(3)* %arrayidx1, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x
+  store double %val, ptr addrspace(3) %arrayidx1, align 8
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll b/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll
index 31b49060c1174..c007174286709 100644
--- a/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll
+++ b/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll
@@ -12,14 +12,14 @@
 ; CHECK: [[LOOP_LABEL:.L[0-9A-Za-z_]+]]: ; %loop{{$}}
 ; CHECK-NOT: s_or_b64 exec, exec
 ; CHECK: s_cbranch_execnz [[LOOP_LABEL]]
-define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out) {
 entry:
   %cond = call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp0 = icmp eq i32 %cond, 0
   br i1 %tmp0, label %if, label %loop
 
 if:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   br label %loop
 
 loop:
@@ -29,8 +29,8 @@ loop:
   br i1 %tmp2, label %done, label %loop
 
 done:
-  %tmp3 = getelementptr i32, i32 addrspace(1)* %out, i64 1
-  store i32 %inc, i32 addrspace(1)* %tmp3
+  %tmp3 = getelementptr i32, ptr addrspace(1) %out, i64 1
+  store i32 %inc, ptr addrspace(1) %tmp3
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll b/llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll
index f5309792e265a..02d9af27ddbe0 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll
@@ -2,72 +2,72 @@
 
 ; GCN-LABEL: {{^}}expand_atomicrmw_agent:
 ; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}}
-define void @expand_atomicrmw_agent(float addrspace(1)* nocapture %arg) {
+define void @expand_atomicrmw_agent(ptr addrspace(1) nocapture %arg) {
 entry:
-  %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("agent") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %arg, float 1.000000e+00 syncscope("agent") monotonic, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}expand_atomicrmw_workgroup:
 ; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}}
-define void @expand_atomicrmw_workgroup(float addrspace(1)* nocapture %arg) {
+define void @expand_atomicrmw_workgroup(ptr addrspace(1) nocapture %arg) {
 entry:
-  %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("workgroup") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %arg, float 1.000000e+00 syncscope("workgroup") monotonic, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}expand_atomicrmw_wavefront:
 ; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}}
-define void @expand_atomicrmw_wavefront(float addrspace(1)* nocapture %arg) {
+define void @expand_atomicrmw_wavefront(ptr addrspace(1) nocapture %arg) {
 entry:
-  %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("wavefront") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %arg, float 1.000000e+00 syncscope("wavefront") monotonic, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}expand_atomicrmw_agent_one_as:
 ; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}}
-define void @expand_atomicrmw_agent_one_as(float addrspace(1)* nocapture %arg) {
+define void @expand_atomicrmw_agent_one_as(ptr addrspace(1) nocapture %arg) {
 entry:
-  %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("agent-one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %arg, float 1.000000e+00 syncscope("agent-one-as") monotonic, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}expand_atomicrmw_workgroup_one_as:
 ; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}}
-define void @expand_atomicrmw_workgroup_one_as(float addrspace(1)* nocapture %arg) {
+define void @expand_atomicrmw_workgroup_one_as(ptr addrspace(1) nocapture %arg) {
 entry:
-  %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("workgroup-one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %arg, float 1.000000e+00 syncscope("workgroup-one-as") monotonic, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}expand_atomicrmw_wavefront_one_as:
 ; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}}
-define void @expand_atomicrmw_wavefront_one_as(float addrspace(1)* nocapture %arg) {
+define void @expand_atomicrmw_wavefront_one_as(ptr addrspace(1) nocapture %arg) {
 entry:
-  %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("wavefront-one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %arg, float 1.000000e+00 syncscope("wavefront-one-as") monotonic, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}expand_atomicrmw_singlethread_one_as:
 ; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}}
-define void @expand_atomicrmw_singlethread_one_as(float addrspace(1)* nocapture %arg) {
+define void @expand_atomicrmw_singlethread_one_as(ptr addrspace(1) nocapture %arg) {
 entry:
-  %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("singlethread-one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %arg, float 1.000000e+00 syncscope("singlethread-one-as") monotonic, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}expand_atomicrmw_one_as:
 ; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}}
-define void @expand_atomicrmw_one_as(float addrspace(1)* nocapture %arg) {
+define void @expand_atomicrmw_one_as(ptr addrspace(1) nocapture %arg) {
 entry:
-  %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("one-as") monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %arg, float 1.000000e+00 syncscope("one-as") monotonic, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}expand_atomicrmw_system:
 ; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}}
-define void @expand_atomicrmw_system(float addrspace(1)* nocapture %arg) {
+define void @expand_atomicrmw_system(ptr addrspace(1) nocapture %arg) {
 entry:
-  %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 monotonic, align 4
+  %ret = atomicrmw fadd ptr addrspace(1) %arg, float 1.000000e+00 monotonic, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index e3072396ee96f..eb1f09c3d486d 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -79,7 +79,7 @@ define i32 @s_add_co_select_user() {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, s1, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %i = load volatile i32, i32 addrspace(4)* null, align 8
+  %i = load volatile i32, ptr addrspace(4) null, align 8
   %i1 = add i32 %i, %i
   %i2 = icmp ult i32 %i1, %i
   %i3 = zext i1 %i2 to i32
@@ -209,10 +209,10 @@ bb:
   br i1 %i6, label %bb0, label %bb1
 
 bb0:
-  store volatile i32 9, i32 addrspace(1)* null
+  store volatile i32 9, ptr addrspace(1) null
   br label %bb1
 
 bb1:
-  store volatile i32 10, i32 addrspace(1)* null
+  store volatile i32 10, ptr addrspace(1) null
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/extload-align.ll b/llvm/test/CodeGen/AMDGPU/extload-align.ll
index 094ab0aea440e..2777bdcd2cc4c 100644
--- a/llvm/test/CodeGen/AMDGPU/extload-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/extload-align.ll
@@ -10,15 +10,14 @@ target datalayout = "A5"
 ; DEBUG: (volatile load (s16) from %ir.a, addrspace 5)
 ; DEBUG: {{^}}# End machine code for function extload_align.
 
-define amdgpu_kernel void @extload_align(i32 addrspace(5)* %out, i32 %index) #0 {
+define amdgpu_kernel void @extload_align(ptr addrspace(5) %out, i32 %index) #0 {
   %v0 = alloca [4 x i16], addrspace(5)
-  %a1 = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 0
-  %a2 = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 1
-  store volatile i16 0, i16 addrspace(5)* %a1
-  store volatile i16 1, i16 addrspace(5)* %a2
-  %a = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 %index
-  %val = load volatile i16, i16 addrspace(5)* %a
+  %a2 = getelementptr inbounds [4 x i16], ptr addrspace(5) %v0, i32 0, i32 1
+  store volatile i16 0, ptr addrspace(5) %v0
+  store volatile i16 1, ptr addrspace(5) %a2
+  %a = getelementptr inbounds [4 x i16], ptr addrspace(5) %v0, i32 0, i32 %index
+  %val = load volatile i16, ptr addrspace(5) %a
   %eval = sext i16 %val to i32
-  store i32 %eval, i32 addrspace(5)* %out
+  store i32 %eval, ptr addrspace(5) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll
index ff6e66af63a9a..b97361f016934 100644
--- a/llvm/test/CodeGen/AMDGPU/extload-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll
@@ -3,44 +3,44 @@
 
 ; FUNC-LABEL: {{^}}load_i8_sext_private:
 ; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
-define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_i8_sext_private(ptr addrspace(1) %out) {
 entry:
   %tmp0 = alloca i8, addrspace(5)
-  %tmp1 = load i8, i8 addrspace(5)* %tmp0
+  %tmp1 = load i8, ptr addrspace(5) %tmp0
   %tmp2 = sext i8 %tmp1 to i32
-  store i32 %tmp2, i32 addrspace(1)* %out
+  store i32 %tmp2, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}load_i8_zext_private:
 ; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
-define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_i8_zext_private(ptr addrspace(1) %out) {
 entry:
   %tmp0 = alloca i8, addrspace(5)
-  %tmp1 = load i8, i8 addrspace(5)* %tmp0
+  %tmp1 = load i8, ptr addrspace(5) %tmp0
   %tmp2 = zext i8 %tmp1 to i32
-  store i32 %tmp2, i32 addrspace(1)* %out
+  store i32 %tmp2, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}load_i16_sext_private:
 ; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
-define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_i16_sext_private(ptr addrspace(1) %out) {
 entry:
   %tmp0 = alloca i16, addrspace(5)
-  %tmp1 = load i16, i16 addrspace(5)* %tmp0
+  %tmp1 = load i16, ptr addrspace(5) %tmp0
   %tmp2 = sext i16 %tmp1 to i32
-  store i32 %tmp2, i32 addrspace(1)* %out
+  store i32 %tmp2, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}load_i16_zext_private:
 ; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 glc{{$}}
-define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_i16_zext_private(ptr addrspace(1) %out) {
 entry:
   %tmp0 = alloca i16, addrspace(5)
-  %tmp1 = load volatile i16, i16 addrspace(5)* %tmp0
+  %tmp1 = load volatile i16, ptr addrspace(5) %tmp0
   %tmp2 = zext i16 %tmp1 to i32
-  store i32 %tmp2, i32 addrspace(1)* %out
+  store i32 %tmp2, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/extload.ll b/llvm/test/CodeGen/AMDGPU/extload.ll
index cb618db4a3bc2..79de37db22a3f 100644
--- a/llvm/test/CodeGen/AMDGPU/extload.ll
+++ b/llvm/test/CodeGen/AMDGPU/extload.ll
@@ -10,12 +10,10 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
 ; EG: VTX_READ_32 [[VAL]]
-define amdgpu_kernel void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
-  %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
-  %load = load i32, i32 addrspace(1)* %cast
+define amdgpu_kernel void @global_anyext_load_i8(ptr addrspace(1) nocapture noalias %out, ptr addrspace(1) nocapture noalias %src) nounwind {
+  %load = load i32, ptr addrspace(1) %src
   %x = bitcast i32 %load to <4 x i8>
-  %castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)*
-  store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut
+  store <4 x i8> %x, ptr addrspace(1) %out
   ret void
 }
 
@@ -25,12 +23,10 @@ define amdgpu_kernel void @global_anyext_load_i8(i8 addrspace(1)* nocapture noal
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
 ; EG: VTX_READ_32 [[VAL]]
-define amdgpu_kernel void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
-  %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
-  %load = load i32, i32 addrspace(1)* %cast
+define amdgpu_kernel void @global_anyext_load_i16(ptr addrspace(1) nocapture noalias %out, ptr addrspace(1) nocapture noalias %src) nounwind {
+  %load = load i32, ptr addrspace(1) %src
   %x = bitcast i32 %load to <2 x i16>
-  %castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)*
-  store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut
+  store <2 x i16> %x, ptr addrspace(1) %out
   ret void
 }
 
@@ -40,12 +36,10 @@ define amdgpu_kernel void @global_anyext_load_i16(i16 addrspace(1)* nocapture no
 
 ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
 ; EG: LDS_WRITE * [[VAL]]
-define amdgpu_kernel void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
-  %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
-  %load = load i32, i32 addrspace(3)* %cast
+define amdgpu_kernel void @local_anyext_load_i8(ptr addrspace(3) nocapture noalias %out, ptr addrspace(3) nocapture noalias %src) nounwind {
+  %load = load i32, ptr addrspace(3) %src
   %x = bitcast i32 %load to <4 x i8>
-  %castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)*
-  store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut
+  store <4 x i8> %x, ptr addrspace(3) %out
   ret void
 }
 
@@ -55,11 +49,9 @@ define amdgpu_kernel void @local_anyext_load_i8(i8 addrspace(3)* nocapture noali
 
 ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
 ; EG: LDS_WRITE * [[VAL]]
-define amdgpu_kernel void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
-  %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
-  %load = load i32, i32 addrspace(3)* %cast
+define amdgpu_kernel void @local_anyext_load_i16(ptr addrspace(3) nocapture noalias %out, ptr addrspace(3) nocapture noalias %src) nounwind {
+  %load = load i32, ptr addrspace(3) %src
   %x = bitcast i32 %load to <2 x i16>
-  %castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)*
-  store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut
+  store <2 x i16> %x, ptr addrspace(3) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index 7447f84f2d078..e376c3df1ac93 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
@@ -18,7 +18,7 @@
 @lds = internal addrspace(3) global [4096 x i8] undef
 
 define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
-  %ptr = getelementptr [4096 x i8], [4096 x i8] addrspace(3)* @lds, i32 0, i32 %voffset
-  store i8 0, i8 addrspace(3)* %ptr
+  %ptr = getelementptr [4096 x i8], ptr addrspace(3) @lds, i32 0, i32 %voffset
+  store i8 0, ptr addrspace(3) %ptr
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
index 5a4c83d478b0d..575f6143bc6a0 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
@@ -3,7 +3,7 @@
 
 ; FIXME: Inefficient codegen which skips an optimization of load +
 ; extractelement when the vector element type is not byte-sized.
-define i1 @extractloadi1(<8 x i1> *%ptr, i32 %idx) {
+define i1 @extractloadi1(ptr %ptr, i32 %idx) {
 ; CHECK-LABEL: extractloadi1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -30,7 +30,7 @@ define i1 @extractloadi1(<8 x i1> *%ptr, i32 %idx) {
 ; CHECK-NEXT:    buffer_load_ubyte v0, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <8 x i1>, <8 x i1> *%ptr
+  %val = load <8 x i1>, ptr %ptr
   %ret = extractelement <8 x i1> %val, i32 %idx
   ret i1 %ret
 }

diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 9ff7c3bd75385..24df34e94b4e7 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
 
-define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
+define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
 ; SI-LABEL: vec_8xi16_extract_4xi16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -117,11 +117,11 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
+  %t = load volatile <8 x i16>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
+  %f = load volatile <8 x i16>, ptr addrspace(1) %p1
   br label %exit
 
 exit:
@@ -132,7 +132,7 @@ exit:
   ret <4 x i16> %r2
 }
 
-define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
+define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
 ; SI-LABEL: vec_8xi16_extract_4xi16_2:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -249,11 +249,11 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
+  %t = load volatile <8 x i16>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
+  %f = load volatile <8 x i16>, ptr addrspace(1) %p1
   br label %exit
 
 exit:
@@ -264,7 +264,7 @@ exit:
   ret <4 x i16> %r2
 }
 
-define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x half> addrspace(1) * %p1) {
+define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
 ; SI-LABEL: vec_8xf16_extract_4xf16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -386,11 +386,11 @@ define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x h
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <8 x half>, <8 x half> addrspace(1) * %p0
+  %t = load volatile <8 x half>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <8 x half>, <8 x half> addrspace(1) * %p1
+  %f = load volatile <8 x half>, ptr addrspace(1) %p1
   br label %exit
 
 exit:
@@ -401,7 +401,7 @@ exit:
   ret <4 x half> %r2
 }
 
-define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x i16> addrspace(1) * %p1) {
+define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
 ;
 ; SI-LABEL: vec_16xi16_extract_4xi16:
 ; SI:       ; %bb.0:
@@ -555,11 +555,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p0
+  %t = load volatile <16 x i16>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p1
+  %f = load volatile <16 x i16>, ptr addrspace(1) %p1
   br label %exit
 
 exit:
@@ -570,7 +570,7 @@ exit:
   ret <4 x i16> %r2
 }
 
-define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16 x i16> addrspace(1) * %p1) {
+define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
 ;
 ; SI-LABEL: vec_16xi16_extract_4xi16_2:
 ; SI:       ; %bb.0:
@@ -726,11 +726,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p0
+  %t = load volatile <16 x i16>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p1
+  %f = load volatile <16 x i16>, ptr addrspace(1) %p1
   br label %exit
 
 exit:
@@ -741,7 +741,7 @@ exit:
   ret <4 x i16> %r2
 }
 
-define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16 x half> addrspace(1) * %p1) {
+define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
 ;
 ; SI-LABEL: vec_16xf16_extract_4xf16:
 ; SI:       ; %bb.0:
@@ -902,11 +902,11 @@ define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <16 x half>, <16 x half> addrspace(1) * %p0
+  %t = load volatile <16 x half>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <16 x half>, <16 x half> addrspace(1) * %p1
+  %f = load volatile <16 x half>, ptr addrspace(1) %p1
   br label %exit
 
 exit:

diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
index 0285d18252b0e..7d1a37ad719b0 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
@@ -20,15 +20,15 @@
 ; GCN: v_bfe_i32
 ; GCN: v_bfe_i32
 
-define <2 x i16> @extract_2xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
+define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
+  %t = load volatile <8 x i16>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
+  %f = load volatile <8 x i16>, ptr addrspace(1) %p1
   br label %exit
 
 exit:
@@ -41,15 +41,15 @@ exit:
 
 ; GCN-LABEL: extract_2xi64
 ; GCN-COUNT-2: v_cndmask_b32
-define <2 x i64> @extract_2xi64(<8 x i64> addrspace(1) * %p0, <8 x i64> addrspace(1) * %p1) {
+define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <8 x i64>, <8 x i64> addrspace(1) * %p0
+  %t = load volatile <8 x i64>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <8 x i64>, <8 x i64> addrspace(1) * %p1
+  %f = load volatile <8 x i64>, ptr addrspace(1) %p1
   br label %exit
 
 exit:
@@ -62,15 +62,15 @@ exit:
 
 ; GCN-LABEL: extract_4xi64
 ; GCN-COUNT-4: v_cndmask_b32
-define <4 x i64> @extract_4xi64(<8 x i64> addrspace(1) * %p0, <8 x i64> addrspace(1) * %p1) {
+define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <8 x i64>, <8 x i64> addrspace(1) * %p0
+  %t = load volatile <8 x i64>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <8 x i64>, <8 x i64> addrspace(1) * %p1
+  %f = load volatile <8 x i64>, ptr addrspace(1) %p1
   br label %exit
 
 exit:
@@ -83,15 +83,15 @@ exit:
 
 ; GCN-LABEL: extract_8xi64
 ; GCN-COUNT-8: v_cndmask_b32
-define <8 x i64> @extract_8xi64(<16 x i64> addrspace(1) * %p0, <16 x i64> addrspace(1) * %p1) {
+define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <16 x i64>, <16 x i64> addrspace(1) * %p0
+  %t = load volatile <16 x i64>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <16 x i64>, <16 x i64> addrspace(1) * %p1
+  %f = load volatile <16 x i64>, ptr addrspace(1) %p1
   br label %exit
 
 exit:
@@ -104,15 +104,15 @@ exit:
 
 ; GCN-LABEL: extract_2xf64
 ; GCN-COUNT-2: v_cndmask_b32
-define <2 x double> @extract_2xf64(<8 x double> addrspace(1) * %p0, <8 x double> addrspace(1) * %p1) {
+define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <8 x double>, <8 x double> addrspace(1) * %p0
+  %t = load volatile <8 x double>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <8 x double>, <8 x double> addrspace(1) * %p1
+  %f = load volatile <8 x double>, ptr addrspace(1) %p1
   br label %exit
 
 exit:
@@ -125,15 +125,15 @@ exit:
 
 ; GCN-LABEL: extract_4xf64
 ; GCN-COUNT-4: v_cndmask_b32
-define <4 x double> @extract_4xf64(<8 x double> addrspace(1) * %p0, <8 x double> addrspace(1) * %p1) {
+define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <8 x double>, <8 x double> addrspace(1) * %p0
+  %t = load volatile <8 x double>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <8 x double>, <8 x double> addrspace(1) * %p1
+  %f = load volatile <8 x double>, ptr addrspace(1) %p1
   br label %exit
 
 exit:
@@ -146,15 +146,15 @@ exit:
 
 ; GCN-LABEL: extract_8xf64
 ; GCN-COUNT-8: v_cndmask_b32
-define <8 x double> @extract_8xf64(<16 x double> addrspace(1) * %p0, <16 x double> addrspace(1) * %p1) {
+define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
   br i1 undef, label %T, label %F
 
 T:
-  %t = load volatile <16 x double>, <16 x double> addrspace(1) * %p0
+  %t = load volatile <16 x double>, ptr addrspace(1) %p0
   br label %exit
 
 F:
-  %f = load volatile <16 x double>, <16 x double> addrspace(1) * %p1
+  %f = load volatile <16 x double>, ptr addrspace(1) %p1
   br label %exit
 
 exit:

diff --git a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
index 423cd00bce726..3a4999ad17f4b 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
@@ -5,7 +5,7 @@
 ; Specifically, we do not want to see a BUFFER_STORE that says "store into
 ; stack" in the middle.
 
-define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) {
+define amdgpu_hs void @main(ptr addrspace(6) inreg %arg) {
   ; GCN-LABEL: name: main
   ; GCN: bb.0.main_body:
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0

diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index c5dd7a2e39b37..e4f4431cbde48 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -12,10 +12,10 @@
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2.0, [[V1]], [[C2]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4.0, [[V2]], [[C3]]
 ; GCN:     store_dword v[{{[0-9:]+}}], [[V3]]
-define amdgpu_kernel void @float4_extelt(float addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <4 x float> <float 0.0, float 1.0, float 2.0, float 4.0>, i32 %sel
-  store float %ext, float addrspace(1)* %out
+  store float %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -29,10 +29,10 @@ entry:
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2, [[V1]], vcc
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4, [[V2]], vcc
 ; GCN: store_dword v[{{[0-9:]+}}], [[V3]]
-define amdgpu_kernel void @int4_extelt(i32 addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 4>, i32 %sel
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -48,10 +48,10 @@ entry:
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x40100a3d, s{{[0-9]+}}
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x70a3d70a, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
-define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel
-  store double %ext, double addrspace(1)* %out
+  store double %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -69,10 +69,10 @@ entry:
 ; GCN-DAG: s_cmp_eq_u32 s{{[[0-9]+}}, 4
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x40140a3d, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
-define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
-  store double %ext, double addrspace(1)* %out
+  store double %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -84,10 +84,10 @@ entry:
 ; GCN:     s_lshr_b64 s[[[RL:[0-9]+]]:{{[0-9]+}}], s[[[SL]]:[[SH]]], [[SEL]]
 ; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
 ; GCN:     store_short v[{{[0-9:]+}}], v[[VRL]]
-define amdgpu_kernel void @half4_extelt(half addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @half4_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, i32 %sel
-  store half %ext, half addrspace(1)* %out
+  store half %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -97,10 +97,10 @@ entry:
 ; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
 ; GCN: store_dword v[{{[0-9:]+}}], [[V1]]
-define amdgpu_kernel void @float2_extelt(float addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <2 x float> <float 0.0, float 1.0>, i32 %sel
-  store float %ext, float addrspace(1)* %out
+  store float %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -110,10 +110,10 @@ entry:
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3f847ae1
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x47ae147b
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
-define amdgpu_kernel void @double2_extelt(double addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <2 x double> <double 0.01, double 1.01>, i32 %sel
-  store double %ext, double addrspace(1)* %out
+  store double %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -141,10 +141,10 @@ entry:
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
 ; GCN:     store_short v[{{[0-9:]+}}], [[V7]]
-define amdgpu_kernel void @half8_extelt(half addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @half8_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <8 x half> <half 1.0, half 2.0, half 3.0, half 4.0, half 5.0, half 6.0, half 7.0, half 8.0>, i32 %sel
-  store half %ext, half addrspace(1)* %out
+  store half %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -172,10 +172,10 @@ entry:
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
 ; GCN:     store_short v[{{[0-9:]+}}], [[V7]]
-define amdgpu_kernel void @short8_extelt(i16 addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @short8_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i32 %sel
-  store i16 %ext, i16 addrspace(1)* %out
+  store i16 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -196,10 +196,10 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN:     flat_store_dword v[{{[0-9:]+}}], [[RES]]
-define amdgpu_kernel void @float8_extelt(float addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @float8_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
-  store float %ext, float addrspace(1)* %out
+  store float %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -212,10 +212,10 @@ entry:
 ; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
 ; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
 ; GCN:     store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]]
-define amdgpu_kernel void @double8_extelt(double addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <8 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0>, i32 %sel
-  store double %ext, double addrspace(1)* %out
+  store double %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -228,10 +228,10 @@ entry:
 ; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
 ; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
 ; GCN:     store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]]
-define amdgpu_kernel void @double7_extelt(double addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <7 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0>, i32 %sel
-  store double %ext, double addrspace(1)* %out
+  store double %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -256,10 +256,10 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000
 ; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]]
 ; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
-define amdgpu_kernel void @float16_extelt(float addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @float16_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %sel
-  store float %ext, float addrspace(1)* %out
+  store float %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -272,10 +272,10 @@ entry:
 ; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
 ; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
 ; GCN:     store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]]
-define amdgpu_kernel void @double15_extelt(double addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <15 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0>, i32 %sel
-  store double %ext, double addrspace(1)* %out
+  store double %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -288,10 +288,10 @@ entry:
 ; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
 ; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
 ; GCN:     store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]]
-define amdgpu_kernel void @double16_extelt(double addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <16 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0, double 16.0>, i32 %sel
-  store double %ext, double addrspace(1)* %out
+  store double %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -332,10 +332,10 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x42000000
 ; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]]
 ; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
-define amdgpu_kernel void @float32_extelt(float addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @float32_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 32.0>, i32 %sel
-  store float %ext, float addrspace(1)* %out
+  store float %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -347,10 +347,10 @@ entry:
 ; GCN:     s_lshr_b64 s[[[RL:[0-9]+]]:{{[0-9]+}}], s[[[SL]]:[[SH]]], [[SEL]]
 ; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
 ; GCN:     store_byte v[{{[0-9:]+}}], v[[VRL]]
-define amdgpu_kernel void @byte8_extelt(i8 addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @byte8_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i32 %sel
-  store i8 %ext, i8 addrspace(1)* %out
+  store i8 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -402,10 +402,10 @@ entry:
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V14:v[0-9]+]], {{[^,]+}}, [[V13]], [[C14]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V15:v[0-9]+]], {{[^,]+}}, [[V14]], [[C15]]
 ; GCN:     store_byte v[{{[0-9:]+}}], [[V15]]
-define amdgpu_kernel void @byte16_extelt(i8 addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @byte16_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, i32 %sel
-  store i8 %ext, i8 addrspace(1)* %out
+  store i8 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -419,11 +419,11 @@ entry:
 ; GCN:     buffer_load_ubyte [[LOAD:v[0-9]+]],
 ; GCN:     v_and_b32_e32 [[RES:v[0-9]+]], 1, [[LOAD]]
 ; GCN:     flat_store_dword v[{{[0-9:]+}}], [[RES]]
-define amdgpu_kernel void @bit4_extelt(i32 addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @bit4_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <4 x i1> <i1 0, i1 1, i1 0, i1 1>, i32 %sel
   %zext = zext i1 %ext to i32
-  store i32 %zext, i32 addrspace(1)* %out
+  store i32 %zext, ptr addrspace(1) %out
   ret void
 }
 
@@ -435,11 +435,11 @@ entry:
 ; GCN: v_cndmask_b32_e{{32|64}} [[VL:v[0-9]+]], 0, [[V1]], [[CL]]
 ; GCN:     v_and_b32_e32 [[RES:v[0-9]+]], 1, [[VL]]
 ; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
-define amdgpu_kernel void @bit128_extelt(i32 addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @bit128_extelt(ptr addrspace(1) %out, i32 %sel) {
 entry:
   %ext = extractelement <128 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, i32 %sel
   %zext = zext i1 %ext to i32
-  store i32 %zext, i32 addrspace(1)* %out
+  store i32 %zext, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index c4d5d5636db76..9b91b06fddeff 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -8,13 +8,13 @@
 ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN-DAG: buffer_store_short [[VELT0]]
 ; GCN-DAG: buffer_store_short [[VELT1]]
-define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
+define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
+  %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
   %p0 = extractelement <2 x half> %vec, i32 0
   %p1 = extractelement <2 x half> %vec, i32 1
-  %out1 = getelementptr half, half addrspace(1)* %out, i32 10
-  store half %p1, half addrspace(1)* %out, align 2
-  store half %p0, half addrspace(1)* %out1, align 2
+  %out1 = getelementptr half, ptr addrspace(1) %out, i32 10
+  store half %p1, ptr addrspace(1) %out, align 2
+  store half %p0, ptr addrspace(1) %out1, align 2
   ret void
 }
 
@@ -26,10 +26,10 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2
 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN: buffer_store_short [[VELT1]]
 ; GCN: ScratchSize: 0
-define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr, i32 %idx) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
+define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 {
+  %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
   %elt = extractelement <2 x half> %vec, i32 %idx
-  store half %elt, half addrspace(1)* %out, align 2
+  store half %elt, ptr addrspace(1) %out, align 2
   ret void
 }
 
@@ -45,15 +45,15 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(
 ; SI: buffer_store_short [[ELT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
 ; GCN: ScratchSize: 0{{$}}
-define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
-  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
-  %idx = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
+  %idx = load i32, ptr addrspace(1) %gep
   %elt = extractelement <2 x half> %vec, i32 %idx
-  store half %elt, half addrspace(1)* %out.gep, align 2
+  store half %elt, ptr addrspace(1) %out.gep, align 2
   ret void
 }
 
@@ -62,12 +62,12 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(
 
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
-define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo) #0 {
   %p0 = extractelement <3 x half> %foo, i32 0
   %p1 = extractelement <3 x half> %foo, i32 2
-  %out1 = getelementptr half, half addrspace(1)* %out, i32 1
-  store half %p1, half addrspace(1)* %out, align 2
-  store half %p0, half addrspace(1)* %out1, align 2
+  %out1 = getelementptr half, ptr addrspace(1) %out, i32 1
+  store half %p1, ptr addrspace(1) %out, align 2
+  store half %p0, ptr addrspace(1) %out1, align 2
   ret void
 }
 
@@ -84,10 +84,10 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3
 ; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 
 ; GCN: {{buffer|global}}_store_short
-define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x half> %foo, i32 %idx
-  %out1 = getelementptr half, half addrspace(1)* %out, i32 1
-  store half %p0, half addrspace(1)* %out
+  %out1 = getelementptr half, ptr addrspace(1) %out, i32 1
+  store half %p0, ptr addrspace(1) %out
   ret void
 }
 
@@ -100,14 +100,14 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %
 
 ; GFX9: global_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off offset:4
 ; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]]
-define amdgpu_kernel void @v_extractelement_v4f16_2(half addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
-  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <4 x half>, ptr addrspace(1) %in.gep
   %vec.extract = extractelement <4 x half> %vec, i32 2
-  store half %vec.extract, half addrspace(1)* %out.gep
+  store half %vec.extract, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -121,15 +121,15 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(half addrspace(1)* %out, <4
 
 ; SI: v_lshr_b64 v[[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]], v[[[LO]]:[[HI]]], [[SCALED_IDX]]
 ; SI: buffer_store_short v[[SHIFT_LO]]
-define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(half addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
-  %idx.val = load volatile i32, i32 addrspace(1)* undef
-  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
+  %idx.val = load volatile i32, ptr addrspace(1) undef
+  %vec = load <4 x half>, ptr addrspace(1) %in.gep
   %vec.extract = extractelement <4 x half> %vec, i32 %idx.val
-  store half %vec.extract, half addrspace(1)* %out.gep
+  store half %vec.extract, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -139,12 +139,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(half addrspace(1)*
 ; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
 ; GCN-NOT: {{s|buffer|flat|global}}_load_
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(<16 x half> addrspace(4)* %ptr) #0 {
-  %load = load <16 x half>, <16 x half> addrspace(4)* %ptr
+define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) %ptr) #0 {
+  %load = load <16 x half>, ptr addrspace(4) %ptr
   %elt0 = extractelement <16 x half> %load, i32 0
   %elt1 = extractelement <16 x half> %load, i32 1
-  store volatile half %elt0, half addrspace(1)* undef, align 2
-  store volatile half %elt1, half addrspace(1)* undef, align 2
+  store volatile half %elt0, ptr addrspace(1) undef, align 2
+  store volatile half %elt1, ptr addrspace(1) undef, align 2
   ret void
 }
 
@@ -154,38 +154,38 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(<16 x half> addrs
 ; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
 ; GCN-NOT: {{s|buffer|flat|global}}_load_
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(<16 x half> addrspace(4)* %ptr) #0 {
-  %load = load <16 x half>, <16 x half> addrspace(4)* %ptr
+define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) %ptr) #0 {
+  %load = load <16 x half>, ptr addrspace(4) %ptr
   %elt2 = extractelement <16 x half> %load, i32 2
   %elt3 = extractelement <16 x half> %load, i32 3
-  store volatile half %elt2, half addrspace(1)* undef, align 2
-  store volatile half %elt3, half addrspace(1)* undef, align 2
+  store volatile half %elt2, ptr addrspace(1) undef, align 2
+  store volatile half %elt3, ptr addrspace(1) undef, align 2
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_extractelement_v8f16_dynamic_sgpr:
 ; GCN-COUNT-7: v_cndmask_b32_e32
-define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(half addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %n) #0 {
+define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
-  %vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <8 x half>, ptr addrspace(1) %in.gep
   %vec.extract = extractelement <8 x half> %vec, i32 %n
-  store half %vec.extract, half addrspace(1)* %out.gep
+  store half %vec.extract, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_extractelement_v16f16_dynamic_sgpr:
 ; GCN-COUNT-15: v_cndmask_b32_e32
-define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(half addrspace(1)* %out, <16 x half> addrspace(1)* %in, i32 %n) #0 {
+define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
-  %vec = load <16 x half>, <16 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <16 x half>, ptr addrspace(1) %in.gep
   %vec.extract = extractelement <16 x half> %vec, i32 %n
-  store half %vec.extract, half addrspace(1)* %out.gep
+  store half %vec.extract, ptr addrspace(1) %out.gep
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
index ffa9b912eae3a..5002929ec6d13 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -5,10 +5,10 @@
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_load_dwordx2
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
-  %ld = load volatile <3 x double>, <3 x double> addrspace(1)* %in
+define amdgpu_kernel void @extract_vector_elt_v3f64_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %ld = load volatile <3 x double>, ptr addrspace(1) %in
   %elt = extractelement <3 x double> %ld, i32 2
-  store volatile double %elt, double addrspace(1)* %out
+  store volatile double %elt, ptr addrspace(1) %out
   ret void
 }
 
@@ -21,9 +21,9 @@ define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out,
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
-define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(ptr addrspace(1) %out, <3 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x double> %foo, i32 %elt
-  store volatile double %dynelt, double addrspace(1)* %out
+  store volatile double %dynelt, ptr addrspace(1) %out
   ret void
 }
 
@@ -39,9 +39,9 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %ou
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
-define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(ptr addrspace(1) %out, <4 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x double> %foo, i32 %elt
-  store volatile double %dynelt, double addrspace(1)* %out
+  store volatile double %dynelt, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index 5be7ab008f509..2d520825b4d1a 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -12,13 +12,13 @@
 ; GFX9: v_mov_b32_e32 [[VVEC:v[0-9]+]], [[VEC]]
 ; GFX9: global_store_short_d16_hi v{{[0-9]+}}, [[VVEC]],
 ; GFX9: buffer_store_short [[VVEC]],
-define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
+define amdgpu_kernel void @extract_vector_elt_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
+  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %p0 = extractelement <2 x i16> %vec, i32 0
   %p1 = extractelement <2 x i16> %vec, i32 1
-  %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
-  store i16 %p1, i16 addrspace(1)* %out, align 2
-  store i16 %p0, i16 addrspace(1)* %out1, align 2
+  %out1 = getelementptr i16, ptr addrspace(1) %out, i32 10
+  store i16 %p1, ptr addrspace(1) %out, align 2
+  store i16 %p0, ptr addrspace(1) %out1, align 2
   ret void
 }
 
@@ -30,10 +30,10 @@ define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x
 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN: buffer_store_short [[VELT1]]
 ; GCN: ScratchSize: 0
-define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 {
+  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %elt = extractelement <2 x i16> %vec, i32 %idx
-  store i16 %elt, i16 addrspace(1)* %out, align 2
+  store i16 %elt, ptr addrspace(1) %out, align 2
   ret void
 }
 
@@ -48,15 +48,15 @@ define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1
 ; SI: buffer_store_short [[ELT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
 ; GCN: ScratchSize: 0{{$}}
-define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
-  %idx = load volatile i32, i32 addrspace(1)* %gep
-  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
+  %idx = load volatile i32, ptr addrspace(1) %gep
+  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %elt = extractelement <2 x i16> %vec, i32 %idx
-  store i16 %elt, i16 addrspace(1)* %out.gep, align 2
+  store i16 %elt, ptr addrspace(1) %out.gep, align 2
   ret void
 }
 
@@ -67,12 +67,12 @@ define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1
 
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
-define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3i16(ptr addrspace(1) %out, <3 x i16> %foo) #0 {
   %p0 = extractelement <3 x i16> %foo, i32 0
   %p1 = extractelement <3 x i16> %foo, i32 2
-  %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
-  store i16 %p1, i16 addrspace(1)* %out, align 2
-  store i16 %p0, i16 addrspace(1)* %out1, align 2
+  %out1 = getelementptr i16, ptr addrspace(1) %out, i32 1
+  store i16 %p1, ptr addrspace(1) %out, align 2
+  store i16 %p0, ptr addrspace(1) %out1, align 2
   ret void
 }
 
@@ -86,12 +86,12 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x
 ; GFX89-DAG: buffer_store_short [[VLOAD0]], off
 ; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[#LOAD + 3]]
 ; GFX89-DAG: buffer_store_short [[VLOAD1]], off
-define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v4i16(ptr addrspace(1) %out, <4 x i16> %foo) #0 {
   %p0 = extractelement <4 x i16> %foo, i32 0
   %p1 = extractelement <4 x i16> %foo, i32 2
-  %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
-  store volatile i16 %p1, i16 addrspace(1)* %out, align 2
-  store volatile i16 %p0, i16 addrspace(1)* %out1, align 2
+  %out1 = getelementptr i16, ptr addrspace(1) %out, i32 10
+  store volatile i16 %p1, ptr addrspace(1) %out, align 2
+  store volatile i16 %p0, ptr addrspace(1) %out1, align 2
   ret void
 }
 
@@ -116,22 +116,22 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x
 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
 ; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s
 ; GCN: {{buffer|global}}_store_short
-define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, [8 x i32], <3 x i16> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(ptr addrspace(1) %out, [8 x i32], <3 x i16> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x i16> %foo, i32 %idx
-  %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
-  store i16 %p0, i16 addrspace(1)* %out
+  %out1 = getelementptr i16, ptr addrspace(1) %out, i32 1
+  store i16 %p0, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v4i16_dynamic_sgpr:
-define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %idx) #0 {
+define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
-  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <4 x i16>, ptr addrspace(1) %in.gep
   %vec.extract = extractelement <4 x i16> %vec, i32 %idx
-  store i16 %vec.extract, i16 addrspace(1)* %out.gep
+  store i16 %vec.extract, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -141,12 +141,12 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(i16 addrspace(1)*
 ; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
 ; GCN-NOT: {{s|buffer|flat|global}}_load_
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(<16 x i16> addrspace(4)* %ptr) #0 {
-  %load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
+define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(ptr addrspace(4) %ptr) #0 {
+  %load = load <16 x i16>, ptr addrspace(4) %ptr
   %elt0 = extractelement <16 x i16> %load, i32 0
   %elt1 = extractelement <16 x i16> %load, i32 1
-  store volatile i16 %elt0, i16 addrspace(1)* undef, align 2
-  store volatile i16 %elt1, i16 addrspace(1)* undef, align 2
+  store volatile i16 %elt0, ptr addrspace(1) undef, align 2
+  store volatile i16 %elt1, ptr addrspace(1) undef, align 2
   ret void
 }
 
@@ -156,12 +156,12 @@ define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(<16 x i16> addrsp
 ; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
 ; GCN-NOT: {{s|buffer|flat|global}}_load_
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(<16 x i16> addrspace(4)* %ptr) #0 {
-  %load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
+define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(ptr addrspace(4) %ptr) #0 {
+  %load = load <16 x i16>, ptr addrspace(4) %ptr
   %elt2 = extractelement <16 x i16> %load, i32 2
   %elt3 = extractelement <16 x i16> %load, i32 3
-  store volatile i16 %elt2, i16 addrspace(1)* undef, align 2
-  store volatile i16 %elt3, i16 addrspace(1)* undef, align 2
+  store volatile i16 %elt2, ptr addrspace(1) undef, align 2
+  store volatile i16 %elt3, ptr addrspace(1) undef, align 2
   ret void
 }
 
@@ -172,14 +172,14 @@ define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(<16 x i16> addrsp
 ; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
 ; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4
 ; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
-define amdgpu_kernel void @v_extractelement_v8i16_2(i16 addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_extractelement_v8i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
-  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <8 x i16>, ptr addrspace(1) %in.gep
   %vec.extract = extractelement <8 x i16> %vec, i32 2
-  store i16 %vec.extract, i16 addrspace(1)* %out.gep
+  store i16 %vec.extract, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -190,27 +190,27 @@ define amdgpu_kernel void @v_extractelement_v8i16_2(i16 addrspace(1)* %out, <8 x
 ; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
 ; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12
 ; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
-define amdgpu_kernel void @v_extractelement_v8i16_6(i16 addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_extractelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
-  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <8 x i16>, ptr addrspace(1) %in.gep
   %vec.extract = extractelement <8 x i16> %vec, i32 6
-  store i16 %vec.extract, i16 addrspace(1)* %out.gep
+  store i16 %vec.extract, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_extractelement_v8i16_dynamic_sgpr:
 ; GCN-COUNT-7: v_cndmask_b32_e32
-define amdgpu_kernel void @v_extractelement_v8i16_dynamic_sgpr(i16 addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %n) #0 {
+define amdgpu_kernel void @v_extractelement_v8i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
-  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <8 x i16>, ptr addrspace(1) %in.gep
   %vec.extract = extractelement <8 x i16> %vec, i32 %n
-  store i16 %vec.extract, i16 addrspace(1)* %out.gep
+  store i16 %vec.extract, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -221,14 +221,14 @@ define amdgpu_kernel void @v_extractelement_v8i16_dynamic_sgpr(i16 addrspace(1)*
 ; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
 ; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4
 ; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
-define amdgpu_kernel void @v_extractelement_v16i16_2(i16 addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_extractelement_v16i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
-  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <16 x i16>, ptr addrspace(1) %in.gep
   %vec.extract = extractelement <16 x i16> %vec, i32 2
-  store i16 %vec.extract, i16 addrspace(1)* %out.gep
+  store i16 %vec.extract, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -239,27 +239,27 @@ define amdgpu_kernel void @v_extractelement_v16i16_2(i16 addrspace(1)* %out, <16
 ; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
 ; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12
 ; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
-define amdgpu_kernel void @v_extractelement_v16i16_6(i16 addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_extractelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
-  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <16 x i16>, ptr addrspace(1) %in.gep
   %vec.extract = extractelement <16 x i16> %vec, i32 6
-  store i16 %vec.extract, i16 addrspace(1)* %out.gep
+  store i16 %vec.extract, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_extractelement_v16i16_dynamic_sgpr:
 ; GCN-COUNT-15: v_cndmask_b32_e32
-define amdgpu_kernel void @v_extractelement_v16i16_dynamic_sgpr(i16 addrspace(1)* %out, <16 x i16> addrspace(1)* %in, i32 %n) #0 {
+define amdgpu_kernel void @v_extractelement_v16i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
-  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <16 x i16>, ptr addrspace(1) %in.gep
   %vec.extract = extractelement <16 x i16> %vec, i32 %n
-  store i16 %vec.extract, i16 addrspace(1)* %out.gep
+  store i16 %vec.extract, ptr addrspace(1) %out.gep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
index b2f5697383f4d..564b57544092f 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -8,24 +8,24 @@
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 {
+define amdgpu_kernel void @extract_vector_elt_select_error(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %val) #0 {
   %vec = bitcast i64 %val to <2 x i32>
   %elt0 = extractelement <2 x i32> %vec, i32 0
   %elt1 = extractelement <2 x i32> %vec, i32 1
 
-  store volatile i32 %elt0, i32 addrspace(1)* %out
-  store volatile i32 %elt1, i32 addrspace(1)* %out
-  store volatile i64 %val, i64 addrspace(1)* %in
+  store volatile i32 %elt0, ptr addrspace(1) %out
+  store volatile i32 %elt1, ptr addrspace(1) %out
+  store volatile i64 %val, ptr addrspace(1) %in
   ret void
 }
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v2i64:
-define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i64(ptr addrspace(1) %out, <2 x i64> %foo) #0 {
   %p0 = extractelement <2 x i64> %foo, i32 0
   %p1 = extractelement <2 x i64> %foo, i32 1
-  %out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1
-  store volatile i64 %p1, i64 addrspace(1)* %out
-  store volatile i64 %p0, i64 addrspace(1)* %out1
+  %out1 = getelementptr i64, ptr addrspace(1) %out, i32 1
+  store volatile i64 %p1, ptr addrspace(1) %out
+  store volatile i64 %p0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -35,9 +35,9 @@ define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
-define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(ptr addrspace(1) %out, <2 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <2 x i64> %foo, i32 %elt
-  store volatile i64 %dynelt, i64 addrspace(1)* %out
+  store volatile i64 %dynelt, ptr addrspace(1) %out
   ret void
 }
 
@@ -49,11 +49,11 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out,
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
-define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
-  %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo
+define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(ptr addrspace(1) %out, ptr addrspace(1) %foo, i32 %elt, <2 x i64> %arst) #0 {
+  %load = load volatile <2 x i64>, ptr addrspace(1) %foo
   %or = or <2 x i64> %load, %arst
   %dynelt = extractelement <2 x i64> %or, i32 %elt
-  store volatile i64 %dynelt, i64 addrspace(1)* %out
+  store volatile i64 %dynelt, ptr addrspace(1) %out
   ret void
 }
 
@@ -66,9 +66,9 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
-define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(ptr addrspace(1) %out, <3 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x i64> %foo, i32 %elt
-  store volatile i64 %dynelt, i64 addrspace(1)* %out
+  store volatile i64 %dynelt, ptr addrspace(1) %out
   ret void
 }
 
@@ -84,9 +84,9 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out,
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
-define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(ptr addrspace(1) %out, <4 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x i64> %foo, i32 %elt
-  store volatile i64 %dynelt, i64 addrspace(1)* %out
+  store volatile i64 %dynelt, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index 541631710dff5..331fe26160d41 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -5,9 +5,9 @@
 ; GCN: s_load_dword [[LOAD:s[0-9]+]]
 ; GCN: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
 ; GCN: buffer_store_byte [[V_LOAD]]
-define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i8> %foo) #0 {
   %p0 = extractelement <1 x i8> %foo, i32 0
-  store i8 %p0, i8 addrspace(1)* %out
+  store i8 %p0, ptr addrspace(1) %out
   ret void
 }
 
@@ -19,12 +19,12 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i
 ; GCN-NOT: {{flat|buffer|global}}
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i8> %foo) #0 {
   %p0 = extractelement <2 x i8> %foo, i32 0
   %p1 = extractelement <2 x i8> %foo, i32 1
-  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store volatile i8 %p1, i8 addrspace(1)* %out
-  store volatile i8 %p0, i8 addrspace(1)* %out1
+  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
+  store volatile i8 %p1, ptr addrspace(1) %out
+  store volatile i8 %p0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -35,12 +35,12 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i
 ; GCN-NOT: {{flat|buffer|global}}
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i8> %foo) #0 {
   %p0 = extractelement <3 x i8> %foo, i32 0
   %p1 = extractelement <3 x i8> %foo, i32 2
-  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store volatile i8 %p1, i8 addrspace(1)* %out
-  store volatile i8 %p0, i8 addrspace(1)* %out1
+  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
+  store volatile i8 %p1, ptr addrspace(1) %out
+  store volatile i8 %p0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -51,12 +51,12 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i
 ; GCN-NOT: {{flat|buffer|global}}
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i8> %foo) #0 {
   %p0 = extractelement <4 x i8> %foo, i32 0
   %p1 = extractelement <4 x i8> %foo, i32 2
-  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store volatile i8 %p1, i8 addrspace(1)* %out
-  store volatile i8 %p0, i8 addrspace(1)* %out1
+  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
+  store volatile i8 %p1, ptr addrspace(1) %out
+  store volatile i8 %p0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -71,8 +71,8 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i
 define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
   %p0 = extractelement <8 x i8> %foo, i32 0
   %p1 = extractelement <8 x i8> %foo, i32 2
-  store volatile i8 %p1, i8 addrspace(1)* null
-  store volatile i8 %p0, i8 addrspace(1)* null
+  store volatile i8 %p1, ptr addrspace(1) null
+  store volatile i8 %p0, ptr addrspace(1) null
   ret void
 }
 
@@ -84,12 +84,12 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
 ; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
 ; GCN: buffer_store_byte [[V_ELT2]]
 ; GCN: buffer_store_byte [[V_LOAD0]]
-define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x i8> %foo) #0 {
   %p0 = extractelement <16 x i8> %foo, i32 0
   %p1 = extractelement <16 x i8> %foo, i32 2
-  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store volatile i8 %p1, i8 addrspace(1)* %out
-  store volatile i8 %p0, i8 addrspace(1)* %out1
+  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
+  store volatile i8 %p1, ptr addrspace(1) %out
+  store volatile i8 %p0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -105,8 +105,8 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x
 define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
   %p0 = extractelement <32 x i8> %foo, i32 0
   %p1 = extractelement <32 x i8> %foo, i32 2
-  store volatile i8 %p1, i8 addrspace(1)* null
-  store volatile i8 %p0, i8 addrspace(1)* null
+  store volatile i8 %p1, ptr addrspace(1) null
+  store volatile i8 %p0, ptr addrspace(1) null
   ret void
 }
 
@@ -118,12 +118,12 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
 ; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
 ; GCN: buffer_store_byte [[V_ELT2]]
 ; GCN: buffer_store_byte [[V_LOAD0]]
-define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x i8> %foo) #0 {
   %p0 = extractelement <64 x i8> %foo, i32 0
   %p1 = extractelement <64 x i8> %foo, i32 2
-  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store volatile i8 %p1, i8 addrspace(1)* %out
-  store volatile i8 %p0, i8 addrspace(1)* %out1
+  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
+  store volatile i8 %p1, ptr addrspace(1) %out
+  store volatile i8 %p0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -140,9 +140,9 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x
 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
 ; VI: v_lshrrev_b16_e32 [[ELT:v[0-9]+]], [[SCALED_IDX]], [[V_LOAD]]
 ; VI: buffer_store_byte [[ELT]]
-define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
   %elt = extractelement <2 x i8> %foo, i32 %idx
-  store volatile i8 %elt, i8 addrspace(1)* %out
+  store volatile i8 %elt, ptr addrspace(1) %out
   ret void
 }
 
@@ -154,10 +154,10 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out
 ; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]]
 ; VI: v_mov_b32_e32 [[V_ELT:v[0-9]+]], [[ELT]]
 ; VI: buffer_store_byte [[V_ELT]]
-define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
   %p0 = extractelement <3 x i8> %foo, i32 %idx
-  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store volatile i8 %p0, i8 addrspace(1)* %out
+  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
+  store volatile i8 %p0, ptr addrspace(1) %out
   ret void
 }
 
@@ -170,11 +170,11 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out
 
 ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
 ; VI: buffer_store_byte [[V_EXTRACT]]
-define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
-  %vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 {
+  %vec = load <4 x i8>, ptr addrspace(4) %vec.ptr
   %p0 = extractelement <4 x i8> %vec, i32 %idx
-  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store volatile i8 %p0, i8 addrspace(1)* %out
+  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
+  store volatile i8 %p0, ptr addrspace(1) %out
   ret void
 }
 
@@ -186,11 +186,11 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out
 ; VI: s_lshr_b64 s[[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC8]], [[SCALED_IDX]]
 ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], s[[EXTRACT_LO]]
 ; VI: buffer_store_byte [[V_EXTRACT]]
-define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 {
-  %vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr
+define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 {
+  %vec = load <8 x i8>, ptr addrspace(4) %vec.ptr
   %p0 = extractelement <8 x i8> %vec, i32 %idx
-  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
-  store volatile i8 %p0, i8 addrspace(1)* %out
+  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
+  store volatile i8 %p0, ptr addrspace(1) %out
   ret void
 }
 
@@ -202,15 +202,15 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 24
 define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
+  %load = load <8 x i8>, ptr addrspace(4) null
   %elt0 = extractelement <8 x i8> %load, i32 0
   %elt1 = extractelement <8 x i8> %load, i32 1
   %elt2 = extractelement <8 x i8> %load, i32 2
   %elt3 = extractelement <8 x i8> %load, i32 3
-  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
-  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
-  store volatile i8 %elt2, i8 addrspace(1)* undef, align 1
-  store volatile i8 %elt3, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt0, ptr addrspace(1) undef, align 1
+  store volatile i8 %elt1, ptr addrspace(1) undef, align 1
+  store volatile i8 %elt2, ptr addrspace(1) undef, align 1
+  store volatile i8 %elt3, ptr addrspace(1) undef, align 1
   ret void
 }
 
@@ -221,15 +221,15 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
 define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
+  %load = load <8 x i8>, ptr addrspace(4) null
   %elt0 = extractelement <8 x i8> %load, i32 0
   %elt1 = extractelement <8 x i8> %load, i32 1
   %elt4 = extractelement <8 x i8> %load, i32 4
   %elt5 = extractelement <8 x i8> %load, i32 5
-  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
-  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
-  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
-  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt0, ptr addrspace(1) undef, align 1
+  store volatile i8 %elt1, ptr addrspace(1) undef, align 1
+  store volatile i8 %elt4, ptr addrspace(1) undef, align 1
+  store volatile i8 %elt5, ptr addrspace(1) undef, align 1
   ret void
 }
 
@@ -240,11 +240,11 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
 ; GCN-NOT: {{s|buffer|flat|global}}_load_
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
 define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
+  %load = load <8 x i8>, ptr addrspace(4) null
   %elt4 = extractelement <8 x i8> %load, i32 4
   %elt5 = extractelement <8 x i8> %load, i32 5
-  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
-  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt4, ptr addrspace(1) undef, align 1
+  store volatile i8 %elt5, ptr addrspace(1) undef, align 1
   ret void
 }
 
@@ -256,15 +256,15 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
 define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(4)* null
+  %load = load <16 x i8>, ptr addrspace(4) null
   %elt0 = extractelement <16 x i8> %load, i32 0
   %elt1 = extractelement <16 x i8> %load, i32 1
   %elt4 = extractelement <16 x i8> %load, i32 4
   %elt5 = extractelement <16 x i8> %load, i32 5
-  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
-  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
-  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
-  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt0, ptr addrspace(1) undef, align 1
+  store volatile i8 %elt1, ptr addrspace(1) undef, align 1
+  store volatile i8 %elt4, ptr addrspace(1) undef, align 1
+  store volatile i8 %elt5, ptr addrspace(1) undef, align 1
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
index e04b5d53feb15..d5464ce6aa8a3 100644
--- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
 ; Make sure the add and load are reduced to 32-bits even with the
 ; bitcast to vector.
-define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+define amdgpu_kernel void @bitcast_int_to_vector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %b) {
 ; GCN-LABEL: bitcast_int_to_vector_extract_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -26,16 +26,16 @@ define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %ou
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
-   %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-   %a = load i64, i64 addrspace(1)* %gep
+   %gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+   %a = load i64, ptr addrspace(1) %gep
    %add = add i64 %a, %b
    %val.bc = bitcast i64 %add to <2 x i32>
    %extract = extractelement <2 x i32> %val.bc, i32 0
-   store i32 %extract, i32 addrspace(1)* %out
+   store i32 %extract, ptr addrspace(1) %out
    ret void
 }
 
-define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
+define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, double %b) {
 ; GCN-LABEL: bitcast_fp_to_vector_extract_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -56,16 +56,16 @@ define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
-   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
-   %a = load double, double addrspace(1)* %gep
+   %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
+   %a = load double, ptr addrspace(1) %gep
    %add = fadd double %a, %b
    %val.bc = bitcast double %add to <2 x i32>
    %extract = extractelement <2 x i32> %val.bc, i32 0
-   store i32 %extract, i32 addrspace(1)* %out
+   store i32 %extract, ptr addrspace(1) %out
    ret void
 }
 
-define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %b) {
 ; GCN-LABEL: bitcast_int_to_fpvector_extract_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -86,16 +86,16 @@ define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)*
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
-   %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-   %a = load i64, i64 addrspace(1)* %gep
+   %gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+   %a = load i64, ptr addrspace(1) %gep
    %add = add i64 %a, %b
    %val.bc = bitcast i64 %add to <2 x float>
    %extract = extractelement <2 x float> %val.bc, i32 0
-   store float %extract, float addrspace(1)* %out
+   store float %extract, ptr addrspace(1) %out
    ret void
 }
 
-define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @no_extract_volatile_load_extract0(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: no_extract_volatile_load_extract0:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -113,13 +113,13 @@ define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 entry:
-  %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+  %vec = load volatile <4 x i32>, ptr addrspace(1) %in
   %elt0 = extractelement <4 x i32> %vec, i32 0
-  store i32 %elt0, i32 addrspace(1)* %out
+  store i32 %elt0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @no_extract_volatile_load_extract2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: no_extract_volatile_load_extract2:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -137,13 +137,13 @@ define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %
 ; GCN-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 entry:
-  %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+  %vec = load volatile <4 x i32>, ptr addrspace(1) %in
   %elt2 = extractelement <4 x i32> %vec, i32 2
-  store i32 %elt2, i32 addrspace(1)* %out
+  store i32 %elt2, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @no_extract_volatile_load_dynextract(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
 ; GCN-LABEL: no_extract_volatile_load_dynextract:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -171,8 +171,8 @@ define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)*
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 entry:
-  %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+  %vec = load volatile <4 x i32>, ptr addrspace(1) %in
   %eltN = extractelement <4 x i32> %vec, i32 %idx
-  store i32 %eltN, i32 addrspace(1)* %out
+  store i32 %eltN, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll
index 7bfe11fcd30ce..dc8dc80146962 100644
--- a/llvm/test/CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll
@@ -1,10 +1,10 @@
 ; RUN: not --crash llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL
 ; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL
 
-define amdgpu_gs void @test_fptrunc_round_legalization(double %a, i32 %data0, <4 x i32> %data1, half addrspace(1)* %out) {
+define amdgpu_gs void @test_fptrunc_round_legalization(double %a, i32 %data0, <4 x i32> %data1, ptr addrspace(1) %out) {
 ; FAIL: LLVM ERROR: Cannot select
   %res = call half @llvm.fptrunc.round.f64(double %a, metadata !"round.upward")
-  store half %res, half addrspace(1)* %out, align 4
+  store half %res, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index b6f87a1f95636..f1542f53461e8 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX11 %s
 
 ; Should not merge this to a dword load
-define i32 @global_load_2xi16_align2(i16 addrspace(1)* %p) #0 {
+define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 {
 ; GFX7-ALIGNED-LABEL: global_load_2xi16_align2:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -63,9 +63,9 @@ define i32 @global_load_2xi16_align2(i16 addrspace(1)* %p) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
-  %p.0 = load i16, i16 addrspace(1)* %p, align 2
-  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 2
+  %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
+  %p.0 = load i16, ptr addrspace(1) %p, align 2
+  %p.1 = load i16, ptr addrspace(1) %gep.p, align 2
   %zext.0 = zext i16 %p.0 to i32
   %zext.1 = zext i16 %p.1 to i32
   %shl.1 = shl i32 %zext.1, 16
@@ -74,7 +74,7 @@ define i32 @global_load_2xi16_align2(i16 addrspace(1)* %p) #0 {
 }
 
 ; Should not merge this to a dword store
-define amdgpu_kernel void @global_store_2xi16_align2(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
@@ -140,14 +140,14 @@ define amdgpu_kernel void @global_store_2xi16_align2(i16 addrspace(1)* %p, i16 a
 ; GFX11-NEXT:    global_store_b16 v0, v2, s[0:1] offset:2
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
-  store i16 1, i16 addrspace(1)* %r, align 2
-  store i16 2, i16 addrspace(1)* %gep.r, align 2
+  %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
+  store i16 1, ptr addrspace(1) %r, align 2
+  store i16 2, ptr addrspace(1) %gep.r, align 2
   ret void
 }
 
 ; Should produce align 1 dword when legal
-define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 {
+define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 {
 ; GFX7-ALIGNED-LABEL: global_load_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -202,9 +202,9 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 {
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
-  %p.0 = load i16, i16 addrspace(1)* %p, align 1
-  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 1
+  %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
+  %p.0 = load i16, ptr addrspace(1) %p, align 1
+  %p.1 = load i16, ptr addrspace(1) %gep.p, align 1
   %zext.0 = zext i16 %p.0 to i32
   %zext.1 = zext i16 %p.1 to i32
   %shl.1 = shl i32 %zext.1, 16
@@ -213,7 +213,7 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 {
 }
 
 ; Should produce align 1 dword when legal
-define amdgpu_kernel void @global_store_2xi16_align1(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
@@ -277,14 +277,14 @@ define amdgpu_kernel void @global_store_2xi16_align1(i16 addrspace(1)* %p, i16 a
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
-  store i16 1, i16 addrspace(1)* %r, align 1
-  store i16 2, i16 addrspace(1)* %gep.r, align 1
+  %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
+  store i16 1, ptr addrspace(1) %r, align 1
+  store i16 2, ptr addrspace(1) %gep.r, align 1
   ret void
 }
 
 ; Should merge this to a dword load
-define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 {
+define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 {
 ; GFX7-ALIGNED-LABEL: global_load_2xi16_align4:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -321,9 +321,9 @@ define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 {
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
-  %p.0 = load i16, i16 addrspace(1)* %p, align 4
-  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 2
+  %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
+  %p.0 = load i16, ptr addrspace(1) %p, align 4
+  %p.1 = load i16, ptr addrspace(1) %gep.p, align 2
   %zext.0 = zext i16 %p.0 to i32
   %zext.1 = zext i16 %p.1 to i32
   %shl.1 = shl i32 %zext.1, 16
@@ -332,7 +332,7 @@ define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 {
 }
 
 ; Should merge this to a dword store
-define amdgpu_kernel void @global_store_2xi16_align4(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
 ; GFX7-LABEL: global_store_2xi16_align4:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
@@ -389,9 +389,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(i16 addrspace(1)* %p, i16 a
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
-  store i16 1, i16 addrspace(1)* %r, align 4
-  store i16 2, i16 addrspace(1)* %gep.r, align 2
+  %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
+  store i16 1, ptr addrspace(1) %r, align 4
+  store i16 2, ptr addrspace(1) %gep.r, align 2
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index 8ea3eb1480cbc..8184764aa352b 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -9,7 +9,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX11-FLASTSCR %s
 
 ; Should not merge this to a dword load
-define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
+define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
 ; GFX7-ALIGNED-LABEL: private_load_2xi16_align2:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -95,9 +95,9 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
-  %p.0 = load i16, i16 addrspace(5)* %p, align 2
-  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
+  %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
+  %p.0 = load i16, ptr addrspace(5) %p, align 2
+  %p.1 = load i16, ptr addrspace(5) %gep.p, align 2
   %zext.0 = zext i16 %p.0 to i32
   %zext.1 = zext i16 %p.1 to i32
   %shl.1 = shl i32 %zext.1, 16
@@ -106,7 +106,7 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
 }
 
 ; Should not merge this to a dword store
-define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
+define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
 ; GFX7-ALIGNED-LABEL: private_store_2xi16_align2:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -194,14 +194,14 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)*
 ; GFX11-FLASTSCR-NEXT:    scratch_store_b16 v1, v2, off offset:2
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
-  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
-  store i16 1, i16 addrspace(5)* %r, align 2
-  store i16 2, i16 addrspace(5)* %gep.r, align 2
+  %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
+  store i16 1, ptr addrspace(5) %r, align 2
+  store i16 2, ptr addrspace(5) %gep.r, align 2
   ret void
 }
 
 ; Should produce align 1 dword when legal
-define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
+define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
 ; GFX7-ALIGNED-LABEL: private_load_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -292,9 +292,9 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
 ; GFX11-FLASTSCR-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLASTSCR-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
-  %p.0 = load i16, i16 addrspace(5)* %p, align 1
-  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 1
+  %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
+  %p.0 = load i16, ptr addrspace(5) %p, align 1
+  %p.1 = load i16, ptr addrspace(5) %gep.p, align 1
   %zext.0 = zext i16 %p.0 to i32
   %zext.1 = zext i16 %p.1 to i32
   %shl.1 = shl i32 %zext.1, 16
@@ -303,7 +303,7 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
 }
 
 ; Should produce align 1 dword when legal
-define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
+define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
 ; GFX7-ALIGNED-LABEL: private_store_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -379,14 +379,14 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)*
 ; GFX11-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
-  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
-  store i16 1, i16 addrspace(5)* %r, align 1
-  store i16 2, i16 addrspace(5)* %gep.r, align 1
+  %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
+  store i16 1, ptr addrspace(5) %r, align 1
+  store i16 2, ptr addrspace(5) %gep.r, align 1
   ret void
 }
 
 ; Should merge this to a dword load
-define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
+define i32 @private_load_2xi16_align4(ptr addrspace(5) %p) #0 {
 ; GFX7-ALIGNED-LABEL: private_load_2xi16_align4:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -446,9 +446,9 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
 ; GFX11-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
-  %p.0 = load i16, i16 addrspace(5)* %p, align 4
-  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
+  %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
+  %p.0 = load i16, ptr addrspace(5) %p, align 4
+  %p.1 = load i16, ptr addrspace(5) %gep.p, align 2
   %zext.0 = zext i16 %p.0 to i32
   %zext.1 = zext i16 %p.1 to i32
   %shl.1 = shl i32 %zext.1, 16
@@ -457,7 +457,7 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
 }
 
 ; Should merge this to a dword store
-define void @private_store_2xi16_align4(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
+define void @private_store_2xi16_align4(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
 ; GFX7-LABEL: private_store_2xi16_align4:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
@@ -535,8 +535,8 @@ define void @private_store_2xi16_align4(i16 addrspace(5)* %p, i16 addrspace(5)*
 ; GFX11-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
-  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
-  store i16 1, i16 addrspace(5)* %r, align 4
-  store i16 2, i16 addrspace(5)* %gep.r, align 2
+  %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
+  store i16 1, ptr addrspace(5) %r, align 4
+  store i16 2, ptr addrspace(5) %gep.r, align 2
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index 8881d1d8b23c6..2ba4bba2075d5 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -6,12 +6,12 @@
 ; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
 ; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
 ; GFX9: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %v = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %v = load float, ptr addrspace(1) %gep, align 4
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -19,13 +19,13 @@ define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addr
 ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 ; GCN-NOT: 1.0
-define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = fmul float %load, 15.0
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -34,13 +34,13 @@ define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = call float @llvm.amdgcn.fmul.legacy(float %load, float 15.0)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -49,13 +49,13 @@ define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(float ad
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = fsub float 15.0, %load
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -64,13 +64,13 @@ define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = fadd float %load, 15.0
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -79,13 +79,13 @@ define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = call float @llvm.sqrt.f32(float %load)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -94,13 +94,13 @@ define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = call float @llvm.ceil.f32(float %load)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -109,13 +109,13 @@ define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspac
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = call float @llvm.floor.f32(float %load)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -125,13 +125,13 @@ define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspac
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -140,13 +140,13 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = call float @llvm.amdgcn.fmad.ftz.f32(float %load, float 15.0, float 15.0)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -158,13 +158,13 @@ define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(float addrs
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
 ; GCN-NOT: 1.0
-define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -176,13 +176,13 @@ define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrsp
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = call float @llvm.canonicalize.f32(float %load)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -191,14 +191,14 @@ define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float a
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
+define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = fpext float %load to double
   %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
-  %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
-  store double %canonicalized, double addrspace(1)* %gep2, align 8
+  %gep2 = getelementptr inbounds double, ptr addrspace(1) %out, i32 %id
+  store double %canonicalized, ptr addrspace(1) %gep2, align 8
   ret void
 }
 
@@ -207,14 +207,14 @@ define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float a
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
+define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
-  %load = load half, half addrspace(1)* %gep, align 2
+  %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
+  %load = load half, ptr addrspace(1) %gep, align 2
   %v = fpext half %load to float
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
-  store float %canonicalized, float addrspace(1)* %gep2, align 4
+  %gep2 = getelementptr inbounds float, ptr addrspace(1) %out, i32 %id
+  store float %canonicalized, ptr addrspace(1) %gep2, align 4
   ret void
 }
 
@@ -223,14 +223,14 @@ define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half ad
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf16(half addrspace(1)* %arg, float addrspace(1)* %out) #2 {
+define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
-  %load = load half, half addrspace(1)* %gep, align 2
+  %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
+  %load = load half, ptr addrspace(1) %gep, align 2
   %v = fpext half %load to float
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
-  store float %canonicalized, float addrspace(1)* %gep2, align 4
+  %gep2 = getelementptr inbounds float, ptr addrspace(1) %out, i32 %id
+  store float %canonicalized, ptr addrspace(1) %gep2, align 4
   ret void
 }
 
@@ -239,14 +239,14 @@ define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf1
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(ptr addrspace(1) %arg, ptr addrspace(1) %out) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
-  %load = load double, double addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
+  %load = load double, ptr addrspace(1) %gep, align 8
   %v = fptrunc double %load to float
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
-  store float %canonicalized, float addrspace(1)* %gep2, align 4
+  %gep2 = getelementptr inbounds float, ptr addrspace(1) %out, i32 %id
+  store float %canonicalized, ptr addrspace(1) %gep2, align 4
   ret void
 }
 
@@ -255,14 +255,14 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double a
 ; GCN-NOT: v_max
 ; GCN-NOT: v_mul
 ; GCN: {{flat|global}}_store_short v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) {
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = fptrunc float %load to half
   %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
-  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
-  store half %canonicalized, half addrspace(1)* %gep2, align 2
+  %gep2 = getelementptr inbounds half, ptr addrspace(1) %out, i32 %id
+  store half %canonicalized, ptr addrspace(1) %gep2, align 2
   ret void
 }
 
@@ -271,14 +271,14 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float ad
 ; GCN-NOT: v_max
 ; GCN-NOT: v_mul
 ; GCN: {{flat|global}}_store_short v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16(float addrspace(1)* %arg, half addrspace(1)* %out) #2 {
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = fptrunc float %load to half
   %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
-  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
-  store half %canonicalized, half addrspace(1)* %gep2, align 2
+  %gep2 = getelementptr inbounds half, ptr addrspace(1) %out, i32 %id
+  store half %canonicalized, ptr addrspace(1) %gep2, align 2
   ret void
 }
 
@@ -291,27 +291,27 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %v = fptrunc <2 x float> %load to <2 x half>
   %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
-  %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id
-  store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4
+  %gep2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i32 %id
+  store <2 x half> %canonicalized, ptr addrspace(1) %gep2, align 4
   ret void
 }
 
 ; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
 ; VI:  v_mul_f32_e32 v{{[0-9]+}}, -1.0, v{{[0-9]+}}
 ; GFX9: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
-define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = fneg float %load
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -320,27 +320,27 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrsp
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v0 = fadd float %load, 0.0
   %v = fneg float %v0
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
 ; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
 ; VI:  v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
 ; GFX9: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
-define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = tail call float @llvm.fabs.f32(float %load)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -350,15 +350,15 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrsp
 
 ; GCN-NOT: v_mul_
 ; GCN-NOT: v_max_
-define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(float addrspace(1)* %arg, float %sign) {
+define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(ptr addrspace(1) %arg, float %sign) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %canon.load = tail call float @llvm.canonicalize.f32(float %load)
   %copysign = call float @llvm.copysign.f32(float %canon.load, float %sign)
   %v = tail call float @llvm.fabs.f32(float %load)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -367,14 +367,14 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(float a
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v0 = fadd float %load, 0.0
   %v = tail call float @llvm.fabs.f32(float %v0)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -383,13 +383,13 @@ define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = tail call float @llvm.sin.f32(float %load)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -398,13 +398,13 @@ define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = tail call float @llvm.cos.f32(float %load)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -413,13 +413,13 @@ define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_short v{{.+}}, [[V0]]
-define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
-  %load = load half, half addrspace(1)* %gep, align 2
+  %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
+  %load = load half, ptr addrspace(1) %gep, align 2
   %v = tail call half @llvm.sin.f16(half %load)
   %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
-  store half %canonicalized, half addrspace(1)* %gep, align 2
+  store half %canonicalized, ptr addrspace(1) %gep, align 2
   ret void
 }
 
@@ -428,13 +428,13 @@ define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_short v{{.+}}, [[V0]]
-define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
-  %load = load half, half addrspace(1)* %gep, align 2
+  %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
+  %load = load half, ptr addrspace(1) %gep, align 2
   %v = tail call half @llvm.cos.f16(half %load)
   %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
-  store half %canonicalized, half addrspace(1)* %gep, align 2
+  store half %canonicalized, ptr addrspace(1) %gep, align 2
   ret void
 }
 
@@ -443,11 +443,11 @@ define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
   %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -461,13 +461,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace
 ; GCN-NOT: v_mul
 
 ; GFX9: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -481,13 +481,13 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_iee
 ; GCN-DENORM-NOT: v_mul
 
 ; GFX9: {{flat|global}}_store_dword
-define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -496,14 +496,14 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nna
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v0 = fadd float %load, 0.0
   %v = tail call float @llvm.minnum.f32(float %v0, float 0.0)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -513,13 +513,13 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspa
 ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]]
 ; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]]
 ; GFX9: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]]
-define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float))
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -540,13 +540,13 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN:   {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
-define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -563,13 +563,13 @@ define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspa
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN:  {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
-define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v = tail call float @llvm.maxnum.f32(float %load, float 0.0)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -578,14 +578,14 @@ define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_iee
 ; GCN-NOT: v_max
 ; GCN-NOT: v_mul
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load = load float, ptr addrspace(1) %gep, align 4
   %v0 = fadd float %load, 0.0
   %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -594,14 +594,14 @@ define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspa
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN:  {{flat|global}}_store_dwordx2 v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
-  %load = load double, double addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
+  %load = load double, ptr addrspace(1) %gep, align 8
   %v0 = fadd double %load, 0.0
   %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0)
   %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
-  store double %canonicalized, double addrspace(1)* %gep, align 8
+  store double %canonicalized, ptr addrspace(1) %gep, align 8
   ret void
 }
 
@@ -648,13 +648,13 @@ entry:
 ; GFX9-DENORM-NOT: v_max
 ; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
 ; GFX9-FLUSH: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %v = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %v = load float, ptr addrspace(1) %gep, align 4
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
-  store float %canonicalized, float addrspace(1)* %gep2, align 4
+  %gep2 = getelementptr inbounds float, ptr addrspace(1) %out, i32 %id
+  store float %canonicalized, ptr addrspace(1) %gep2, align 4
   ret void
 }
 
@@ -663,13 +663,13 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addr
 ; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]]
 ; GCN-NOT: v_mul_
 ; GCN-NOT: v_max_
-define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
-  %v = load double, double addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
+  %v = load double, ptr addrspace(1) %gep, align 8
   %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
-  %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
-  store double %canonicalized, double addrspace(1)* %gep2, align 8
+  %gep2 = getelementptr inbounds double, ptr addrspace(1) %out, i32 %id
+  store double %canonicalized, ptr addrspace(1) %gep2, align 8
   ret void
 }
 
@@ -678,13 +678,13 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double add
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
 ; GCN: {{flat|global}}_store_short v{{.+}}, [[V]]
-define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
-  %v = load half, half addrspace(1)* %gep, align 2
+  %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
+  %v = load half, ptr addrspace(1) %gep, align 2
   %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
-  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
-  store half %canonicalized, half addrspace(1)* %gep2, align 2
+  %gep2 = getelementptr inbounds half, ptr addrspace(1) %out, i32 %id
+  store half %canonicalized, ptr addrspace(1) %gep2, align 2
   ret void
 }
 
@@ -694,18 +694,18 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrs
 ; GCN: v_cndmask_b32
 ; GCN-NOT: v_mul_
 ; GCN-NOT: v_max_
-define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
-  %load0 = load volatile float, float addrspace(1)* %gep, align 4
-  %load1 = load volatile float, float addrspace(1)* %gep, align 4
-  %load2 = load volatile i32, i32 addrspace(1)* undef, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+  %load0 = load volatile float, ptr addrspace(1) %gep, align 4
+  %load1 = load volatile float, ptr addrspace(1) %gep, align 4
+  %load2 = load volatile i32, ptr addrspace(1) undef, align 4
   %v0 = fadd float %load0, 15.0
   %v1 = fadd float %load1, 32.0
   %cond = icmp eq i32 %load2, 0
   %select = select i1 %cond, float %v0, float %v1
   %canonicalized = tail call float @llvm.canonicalize.f32(float %select)
-  store float %canonicalized, float addrspace(1)* %gep, align 4
+  store float %canonicalized, ptr addrspace(1) %gep, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll b/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll
index 7f8be804309ee..da53c2151ccdd 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll
@@ -4,11 +4,11 @@
 ;registers and literal.x depending on what the optimizer does.
 ;CHECK: CNDE  T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %0 = load float, float addrspace(1)* %in
+  %0 = load float, ptr addrspace(1) %in
   %cmp = fcmp oeq float %0, 0.000000e+00
   %value = select i1 %cmp, i32 2, i32 3 
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll b/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
index 2a848e80b81bf..a823654864f2f 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
@@ -6,11 +6,11 @@
 
 ; CHECK: SET{{[A-Z]+}}_DX10
 
-define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %0 = load float, float addrspace(1)* %in
+  %0 = load float, ptr addrspace(1) %in
   %cmp = fcmp oeq float %0, 0.000000e+00
   %value = select i1 %cmp, i32 -1, i32 0
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index faa2d115db22d..8347a0282511e 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -12,15 +12,15 @@
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_lt(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp olt half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -38,17 +38,17 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_lt_abs(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %a.abs = call half @llvm.fabs.f16(half %a.val)
   %b.abs = call half @llvm.fabs.f16(half %b.val)
   %r.val = fcmp olt half %a.abs, %b.abs
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -63,15 +63,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_eq(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp oeq half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -86,15 +86,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_le(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp ole half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -109,15 +109,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_gt(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp ogt half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -132,15 +132,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_lg(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp one half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -155,15 +155,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_ge(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp oge half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -178,15 +178,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_o(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp ord half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -201,15 +201,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_u(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp uno half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -224,15 +224,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_nge(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp ult half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -247,15 +247,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_nlg(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp ueq half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -270,15 +270,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_ngt(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp ule half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -293,15 +293,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_nle(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp ugt half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -316,15 +316,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_neq(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp une half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -339,15 +339,15 @@ entry:
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_f16_nlt(
-    i32 addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
   %r.val = fcmp uge half %a.val, %b.val
   %r.val.sext = sext i1 %r.val to i32
-  store i32 %r.val.sext, i32 addrspace(1)* %r
+  store i32 %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -358,15 +358,15 @@ entry:
 ; VI: v_cmp_lt_f16_e32 vcc,
 ; VI: v_cmp_lt_f16_e32 vcc,
 define amdgpu_kernel void @fcmp_v2f16_lt(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp olt <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -377,15 +377,15 @@ entry:
 ; VI:  v_cmp_eq_f16_e32 vcc,
 ; VI:  v_cmp_eq_f16_e32 vcc,
 define amdgpu_kernel void @fcmp_v2f16_eq(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp oeq <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -395,15 +395,15 @@ entry:
 ; VI:  v_cmp_le_f16_e32 vcc
 ; VI:  v_cmp_le_f16_e32 vcc
 define amdgpu_kernel void @fcmp_v2f16_le(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp ole <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -414,15 +414,15 @@ entry:
 ; VI: v_cmp_gt_f16_e32 vcc,
 ; VI: v_cmp_gt_f16_e32 vcc,
 define amdgpu_kernel void @fcmp_v2f16_gt(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp ogt <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -433,15 +433,15 @@ entry:
 ; VI: v_cmp_lg_f16_e32 vcc,
 ; VI: v_cmp_lg_f16_e32 vcc,
 define amdgpu_kernel void @fcmp_v2f16_lg(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp one <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -452,15 +452,15 @@ entry:
 ; VI:  v_cmp_ge_f16_e32 vcc,
 ; VI:  v_cmp_ge_f16_e32 vcc,
 define amdgpu_kernel void @fcmp_v2f16_ge(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp oge <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -471,15 +471,15 @@ entry:
 ; VI:  v_cmp_o_f16_e32 vcc,
 ; VI:  v_cmp_o_f16_e32 vcc,
 define amdgpu_kernel void @fcmp_v2f16_o(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp ord <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -490,15 +490,15 @@ entry:
 ; VI:  v_cmp_u_f16_e32 vcc,
 ; VI:  v_cmp_u_f16_e32 vcc,
 define amdgpu_kernel void @fcmp_v2f16_u(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp uno <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -509,15 +509,15 @@ entry:
 ; VI:  v_cmp_nge_f16_e32 vcc,
 ; VI:  v_cmp_nge_f16_e32 vcc,
 define amdgpu_kernel void @fcmp_v2f16_nge(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp ult <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -528,15 +528,15 @@ entry:
 ; VI:  v_cmp_nlg_f16_e32 vcc
 ; VI:  v_cmp_nlg_f16_e32 vcc
 define amdgpu_kernel void @fcmp_v2f16_nlg(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp ueq <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -547,15 +547,15 @@ entry:
 ; VI:  v_cmp_ngt_f16_e32 vcc,
 ; VI:  v_cmp_ngt_f16_e32 vcc,
 define amdgpu_kernel void @fcmp_v2f16_ngt(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp ule <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -566,15 +566,15 @@ entry:
 ; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @fcmp_v2f16_nle(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp ugt <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -585,15 +585,15 @@ entry:
 ; VI:  v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:  v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @fcmp_v2f16_neq(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp une <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 
@@ -617,15 +617,15 @@ entry:
 ; GCN: buffer_store_dwordx2 v[[[R_I32_0]]:[[R_I32_1]]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fcmp_v2f16_nlt(
-    <2 x i32> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
   %r.val = fcmp uge <2 x half> %a.val, %b.val
   %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
-  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
+  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fcmp.ll b/llvm/test/CodeGen/AMDGPU/fcmp.ll
index b548670edb066..7846c02030988 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.ll
@@ -3,14 +3,14 @@
 ; CHECK: {{^}}fcmp_sext:
 ; CHECK: SETE_DX10  T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define amdgpu_kernel void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @fcmp_sext(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %0 = load float, float addrspace(1)* %in
-  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1
-  %1 = load float, float addrspace(1)* %arrayidx1
+  %0 = load float, ptr addrspace(1) %in
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %in, i32 1
+  %1 = load float, ptr addrspace(1) %arrayidx1
   %cmp = fcmp oeq float %0, %1
   %sext = sext i1 %cmp to i32
-  store i32 %sext, i32 addrspace(1)* %out
+  store i32 %sext, ptr addrspace(1) %out
   ret void
 }
 
@@ -22,17 +22,17 @@ entry:
 ; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}}
 ; CHECK-NEXT: {{[0-9]+\(5.0}}
 
-define amdgpu_kernel void @fcmp_br(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_br(ptr addrspace(1) %out, float %in) {
 entry:
   %0 = fcmp oeq float %in, 5.0
   br i1 %0, label %IF, label %ENDIF
 
 IF:
-  %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  store i32 0, i32 addrspace(1)* %1
+  %1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  store i32 0, ptr addrspace(1) %1
   br label %ENDIF
 
 ENDIF:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fcmp64.ll b/llvm/test/CodeGen/AMDGPU/fcmp64.ll
index 236ba53c3ed2e..e38eb0a3a916a 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp64.ll
@@ -3,72 +3,72 @@
 
 ; CHECK-LABEL: {{^}}flt_f64:
 ; CHECK: v_cmp_nge_f64_e32 vcc, {{s\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
-                     double addrspace(1)* %in2) {
-   %r0 = load double, double addrspace(1)* %in1
-   %r1 = load double, double addrspace(1)* %in2
+define amdgpu_kernel void @flt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
+                     ptr addrspace(1) %in2) {
+   %r0 = load double, ptr addrspace(1) %in1
+   %r1 = load double, ptr addrspace(1) %in2
    %r2 = fcmp ult double %r0, %r1
    %r3 = zext i1 %r2 to i32
-   store i32 %r3, i32 addrspace(1)* %out
+   store i32 %r3, ptr addrspace(1) %out
    ret void
 }
 
 ; CHECK-LABEL: {{^}}fle_f64:
 ; CHECK: v_cmp_ngt_f64_e32 vcc, {{s\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
-                     double addrspace(1)* %in2) {
-   %r0 = load double, double addrspace(1)* %in1
-   %r1 = load double, double addrspace(1)* %in2
+define amdgpu_kernel void @fle_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
+                     ptr addrspace(1) %in2) {
+   %r0 = load double, ptr addrspace(1) %in1
+   %r1 = load double, ptr addrspace(1) %in2
    %r2 = fcmp ule double %r0, %r1
    %r3 = zext i1 %r2 to i32
-   store i32 %r3, i32 addrspace(1)* %out
+   store i32 %r3, ptr addrspace(1) %out
    ret void
 }
 
 ; CHECK-LABEL: {{^}}fgt_f64:
 ; CHECK: v_cmp_nle_f64_e32 vcc, {{s\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
-                     double addrspace(1)* %in2) {
-   %r0 = load double, double addrspace(1)* %in1
-   %r1 = load double, double addrspace(1)* %in2
+define amdgpu_kernel void @fgt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
+                     ptr addrspace(1) %in2) {
+   %r0 = load double, ptr addrspace(1) %in1
+   %r1 = load double, ptr addrspace(1) %in2
    %r2 = fcmp ugt double %r0, %r1
    %r3 = zext i1 %r2 to i32
-   store i32 %r3, i32 addrspace(1)* %out
+   store i32 %r3, ptr addrspace(1) %out
    ret void
 }
 
 ; CHECK-LABEL: {{^}}fge_f64:
 ; CHECK: v_cmp_nlt_f64_e32 vcc, {{s\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
-                     double addrspace(1)* %in2) {
-   %r0 = load double, double addrspace(1)* %in1
-   %r1 = load double, double addrspace(1)* %in2
+define amdgpu_kernel void @fge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
+                     ptr addrspace(1) %in2) {
+   %r0 = load double, ptr addrspace(1) %in1
+   %r1 = load double, ptr addrspace(1) %in2
    %r2 = fcmp uge double %r0, %r1
    %r3 = zext i1 %r2 to i32
-   store i32 %r3, i32 addrspace(1)* %out
+   store i32 %r3, ptr addrspace(1) %out
    ret void
 }
 
 ; CHECK-LABEL: {{^}}fne_f64:
 ; CHECK: v_cmp_neq_f64_e32 vcc, {{s\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
-                     double addrspace(1)* %in2) {
-   %r0 = load double, double addrspace(1)* %in1
-   %r1 = load double, double addrspace(1)* %in2
+define amdgpu_kernel void @fne_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
+                     ptr addrspace(1) %in2) {
+   %r0 = load double, ptr addrspace(1) %in1
+   %r1 = load double, ptr addrspace(1) %in2
    %r2 = fcmp une double %r0, %r1
    %r3 = select i1 %r2, double %r0, double %r1
-   store double %r3, double addrspace(1)* %out
+   store double %r3, ptr addrspace(1) %out
    ret void
 }
 
 ; CHECK-LABEL: {{^}}feq_f64:
 ; CHECK: v_cmp_nlg_f64_e32 vcc, {{s\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
-                     double addrspace(1)* %in2) {
-   %r0 = load double, double addrspace(1)* %in1
-   %r1 = load double, double addrspace(1)* %in2
+define amdgpu_kernel void @feq_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
+                     ptr addrspace(1) %in2) {
+   %r0 = load double, ptr addrspace(1) %in1
+   %r1 = load double, ptr addrspace(1) %in2
    %r2 = fcmp ueq double %r0, %r1
    %r3 = select i1 %r2, double %r0, double %r1
-   store double %r3, double addrspace(1)* %out
+   store double %r3, ptr addrspace(1) %out
    ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 3df4edef5f945..138f80dfec16f 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -26,14 +26,14 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_copysign_f16(
-  half addrspace(1)* %arg_out,
-  half addrspace(1)* %arg_mag,
-  half addrspace(1)* %arg_sign) {
+  ptr addrspace(1) %arg_out,
+  ptr addrspace(1) %arg_mag,
+  ptr addrspace(1) %arg_sign) {
 entry:
-  %mag = load volatile half, half addrspace(1)* %arg_mag
-  %sign = load volatile half, half addrspace(1)* %arg_sign
+  %mag = load volatile half, ptr addrspace(1) %arg_mag
+  %sign = load volatile half, ptr addrspace(1) %arg_sign
   %out = call half @llvm.copysign.f16(half %mag, half %sign)
-  store half %out, half addrspace(1)* %arg_out
+  store half %out, ptr addrspace(1) %arg_out
   ret void
 }
 
@@ -46,18 +46,18 @@ entry:
 ; GCN: buffer_store_dword v[[OUT]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32(
-  float addrspace(1)* %arg_out,
-  half addrspace(1)* %arg_mag,
-  float addrspace(1)* %arg_sign) {
+  ptr addrspace(1) %arg_out,
+  ptr addrspace(1) %arg_mag,
+  ptr addrspace(1) %arg_sign) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
-  %mag = load half, half addrspace(1)* %arg_mag_gep
+  %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
+  %mag = load half, ptr addrspace(1) %arg_mag_gep
   %mag.ext = fpext half %mag to float
-  %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
-  %sign = load float, float addrspace(1)* %arg_sign_gep
+  %arg_sign_gep = getelementptr float, ptr addrspace(1) %arg_sign, i32 %tid
+  %sign = load float, ptr addrspace(1) %arg_sign_gep
   %out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
-  store float %out, float addrspace(1)* %arg_out
+  store float %out, ptr addrspace(1) %arg_out
   ret void
 }
 
@@ -71,18 +71,18 @@ entry:
 ; GCN: buffer_store_dwordx2 v[[[MAG_EXT_LO]]:[[OUT_HI]]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64(
-  double addrspace(1)* %arg_out,
-  half addrspace(1)* %arg_mag,
-  double addrspace(1)* %arg_sign) {
+  ptr addrspace(1) %arg_out,
+  ptr addrspace(1) %arg_mag,
+  ptr addrspace(1) %arg_sign) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
-  %mag = load half, half addrspace(1)* %arg_mag_gep
+  %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
+  %mag = load half, ptr addrspace(1) %arg_mag_gep
   %mag.ext = fpext half %mag to double
-  %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
-  %sign = load double, double addrspace(1)* %arg_sign_gep
+  %arg_sign_gep = getelementptr double, ptr addrspace(1) %arg_sign, i32 %tid
+  %sign = load double, ptr addrspace(1) %arg_sign_gep
   %out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
-  store double %out, double addrspace(1)* %arg_out
+  store double %out, ptr addrspace(1) %arg_out
   ret void
 }
 
@@ -97,18 +97,18 @@ entry:
 ; GCN: buffer_store_dword v[[OUT]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16(
-  float addrspace(1)* %arg_out,
-  float addrspace(1)* %arg_mag,
-  half addrspace(1)* %arg_sign) {
+  ptr addrspace(1) %arg_out,
+  ptr addrspace(1) %arg_mag,
+  ptr addrspace(1) %arg_sign) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
-  %mag = load float, float addrspace(1)* %arg_mag_gep
-  %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
-  %sign = load half, half addrspace(1)* %arg_sign_gep
+  %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
+  %mag = load float, ptr addrspace(1) %arg_mag_gep
+  %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid
+  %sign = load half, ptr addrspace(1) %arg_sign_gep
   %sign.ext = fpext half %sign to float
   %out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
-  store float %out, float addrspace(1)* %arg_out
+  store float %out, ptr addrspace(1) %arg_out
   ret void
 }
 
@@ -123,18 +123,18 @@ entry:
 ; GCN: buffer_store_dwordx2 v[[[MAG_LO]]:[[OUT_HI]]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16(
-  double addrspace(1)* %arg_out,
-  double addrspace(1)* %arg_mag,
-  half addrspace(1)* %arg_sign) {
+  ptr addrspace(1) %arg_out,
+  ptr addrspace(1) %arg_mag,
+  ptr addrspace(1) %arg_sign) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %arg_mag_gep = getelementptr double, double addrspace(1)* %arg_mag, i32 %tid
-  %mag = load double, double addrspace(1)* %arg_mag_gep
-  %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
-  %sign = load half, half addrspace(1)* %arg_sign_gep
+  %arg_mag_gep = getelementptr double, ptr addrspace(1) %arg_mag, i32 %tid
+  %mag = load double, ptr addrspace(1) %arg_mag_gep
+  %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid
+  %sign = load half, ptr addrspace(1) %arg_sign_gep
   %sign.ext = fpext half %sign to double
   %out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
-  store double %out, double addrspace(1)* %arg_out
+  store double %out, ptr addrspace(1) %arg_out
   ret void
 }
 
@@ -151,18 +151,18 @@ entry:
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32(
-  half addrspace(1)* %arg_out,
-  half addrspace(1)* %arg_mag,
-  float addrspace(1)* %arg_sign) {
+  ptr addrspace(1) %arg_out,
+  ptr addrspace(1) %arg_mag,
+  ptr addrspace(1) %arg_sign) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
-  %mag = load half, half addrspace(1)* %arg_mag_gep
-  %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
-  %sign = load float, float addrspace(1)* %arg_sign_gep
+  %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
+  %mag = load half, ptr addrspace(1) %arg_mag_gep
+  %arg_sign_gep = getelementptr float, ptr addrspace(1) %arg_sign, i32 %tid
+  %sign = load float, ptr addrspace(1) %arg_sign_gep
   %sign.trunc = fptrunc float %sign to half
   %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
-  store half %out, half addrspace(1)* %arg_out
+  store half %out, ptr addrspace(1) %arg_out
   ret void
 }
 
@@ -179,18 +179,18 @@ entry:
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64(
-  half addrspace(1)* %arg_out,
-  half addrspace(1)* %arg_mag,
-  double addrspace(1)* %arg_sign) {
+  ptr addrspace(1) %arg_out,
+  ptr addrspace(1) %arg_mag,
+  ptr addrspace(1) %arg_sign) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
-  %mag = load half, half addrspace(1)* %arg_mag
-  %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
-  %sign = load double, double addrspace(1)* %arg_sign_gep
+  %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
+  %mag = load half, ptr addrspace(1) %arg_mag
+  %arg_sign_gep = getelementptr double, ptr addrspace(1) %arg_sign, i32 %tid
+  %sign = load double, ptr addrspace(1) %arg_sign_gep
   %sign.trunc = fptrunc double %sign to half
   %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
-  store half %out, half addrspace(1)* %arg_out
+  store half %out, ptr addrspace(1) %arg_out
   ret void
 }
 
@@ -209,18 +209,18 @@ entry:
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16(
-  half addrspace(1)* %arg_out,
-  float addrspace(1)* %arg_mag,
-  half addrspace(1)* %arg_sign) {
+  ptr addrspace(1) %arg_out,
+  ptr addrspace(1) %arg_mag,
+  ptr addrspace(1) %arg_sign) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
-  %mag = load float, float addrspace(1)* %arg_mag_gep
+  %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
+  %mag = load float, ptr addrspace(1) %arg_mag_gep
   %mag.trunc = fptrunc float %mag to half
-  %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
-  %sign = load half, half addrspace(1)* %arg_sign_gep
+  %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid
+  %sign = load half, ptr addrspace(1) %arg_sign_gep
   %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
-  store half %out, half addrspace(1)* %arg_out
+  store half %out, ptr addrspace(1) %arg_out
   ret void
 }
 
@@ -228,15 +228,15 @@ entry:
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_copysign_out_f16_mag_f64_sign_f16(
-  half addrspace(1)* %arg_out,
-  double addrspace(1)* %arg_mag,
-  half addrspace(1)* %arg_sign) {
+  ptr addrspace(1) %arg_out,
+  ptr addrspace(1) %arg_mag,
+  ptr addrspace(1) %arg_sign) {
 entry:
-  %mag = load double, double addrspace(1)* %arg_mag
+  %mag = load double, ptr addrspace(1) %arg_mag
   %mag.trunc = fptrunc double %mag to half
-  %sign = load half, half addrspace(1)* %arg_sign
+  %sign = load half, ptr addrspace(1) %arg_sign
   %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
-  store half %out, half addrspace(1)* %arg_out
+  store half %out, ptr addrspace(1) %arg_out
   ret void
 }
 
@@ -246,12 +246,12 @@ entry:
 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_copysign_v2f16(
-  <2 x half> addrspace(1)* %arg_out,
+  ptr addrspace(1) %arg_out,
   <2 x half> %arg_mag,
   <2 x half> %arg_sign) {
 entry:
   %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign)
-  store <2 x half> %out, <2 x half> addrspace(1)* %arg_out
+  store <2 x half> %out, ptr addrspace(1) %arg_out
   ret void
 }
 
@@ -261,12 +261,12 @@ entry:
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_copysign_v3f16(
-  <3 x half> addrspace(1)* %arg_out,
+  ptr addrspace(1) %arg_out,
   <3 x half> %arg_mag,
   <3 x half> %arg_sign) {
 entry:
   %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign)
-  store <3 x half> %out, <3 x half> addrspace(1)* %arg_out
+  store <3 x half> %out, ptr addrspace(1) %arg_out
   ret void
 }
 
@@ -277,11 +277,11 @@ entry:
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_copysign_v4f16(
-  <4 x half> addrspace(1)* %arg_out,
+  ptr addrspace(1) %arg_out,
   <4 x half> %arg_mag,
   <4 x half> %arg_sign) {
 entry:
   %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign)
-  store <4 x half> %out, <4 x half> addrspace(1)* %arg_out
+  store <4 x half> %out, ptr addrspace(1) %arg_out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index aaa185e23e82f..e02230d4e421b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -19,9 +19,9 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read
 ; GCN: s_endpgm
 
 ; EG: BFI_INT
-define amdgpu_kernel void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f32(ptr addrspace(1) %out, float %mag, float %sign) nounwind {
   %result = call float @llvm.copysign.f32(float %mag, float %sign)
-  store float %result, float addrspace(1)* %out, align 4
+  store float %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -30,9 +30,9 @@ define amdgpu_kernel void @test_copysign_f32(float addrspace(1)* %out, float %ma
 
 ; EG: BFI_INT
 ; EG: BFI_INT
-define amdgpu_kernel void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) nounwind {
   %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
-  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
+  store <2 x float> %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -43,9 +43,9 @@ define amdgpu_kernel void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <
 ; EG: BFI_INT
 ; EG: BFI_INT
 ; EG: BFI_INT
-define amdgpu_kernel void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v4f32(ptr addrspace(1) %out, <4 x float> %mag, <4 x float> %sign) nounwind {
   %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
-  store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
+  store <4 x float> %result, ptr addrspace(1) %out, align 16
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 292eb1fa0b31e..7ca233446732b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -17,9 +17,9 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
 ; GCN: buffer_store_dwordx2 v[[[VMAG_LO]]:[[VRESULT_HI]]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, [8 x i32], double %mag, [8 x i32], double %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], double %sign) nounwind {
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
-  store double %result, double addrspace(1)* %out, align 8
+  store double %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -33,25 +33,25 @@ define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, [8 x i32
 ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]]
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
 ; GCN: buffer_store_dwordx2 v[[[VMAG_LO]]:[[VRESULT_HI]]]
-define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, [8 x i32], double %mag, float %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double %mag, float %sign) nounwind {
   %c = fpext float %sign to double
   %result = call double @llvm.copysign.f64(double %mag, double %c)
-  store double %result, double addrspace(1)* %out, align 8
+  store double %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}test_copysign_v2f64:
 ; GCN: s_endpgm
-define amdgpu_kernel void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %mag, <2 x double> %sign) nounwind {
   %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
-  store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8
+  store <2 x double> %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}test_copysign_v4f64:
 ; GCN: s_endpgm
-define amdgpu_kernel void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) nounwind {
   %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
-  store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8
+  store <4 x double> %result, ptr addrspace(1) %out, align 8
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fdot2.ll b/llvm/test/CodeGen/AMDGPU/fdot2.ll
index 7da5dbd987166..8573cd4d1fe13 100644
--- a/llvm/test/CodeGen/AMDGPU/fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdot2.ll
@@ -21,12 +21,12 @@
 
 ; GFX906-CONTRACT: v_mac_f16_e32
 ; GFX906-DENORM-CONTRACT: v_fma_f16
-define amdgpu_kernel void @dotproduct_f16(<2 x half> addrspace(1)* %src1,
-                                          <2 x half> addrspace(1)* %src2,
-                                          half addrspace(1)* nocapture %dst) {
+define amdgpu_kernel void @dotproduct_f16(ptr addrspace(1) %src1,
+                                          ptr addrspace(1) %src2,
+                                          ptr addrspace(1) nocapture %dst) {
 entry:
-  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
-  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+  %src1.vec = load <2 x half>, ptr addrspace(1) %src1
+  %src2.vec = load <2 x half>, ptr addrspace(1) %src2
 
   %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
   %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
@@ -36,10 +36,10 @@ entry:
 
   %mul2 = fmul half %src1.el2, %src2.el2
   %mul1 = fmul half %src1.el1, %src2.el1
-  %acc = load half, half addrspace(1)* %dst, align 2
+  %acc = load half, ptr addrspace(1) %dst, align 2
   %acc1 = fadd half %mul2, %acc
   %acc2 = fadd half %mul1, %acc1
-  store half %acc2, half addrspace(1)* %dst, align 2
+  store half %acc2, ptr addrspace(1) %dst, align 2
   ret void
 }
 
@@ -59,12 +59,12 @@ entry:
 ; GFX906-CONTRACT: v_dot2_f32_f16
 
 ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
-define amdgpu_kernel void @dotproduct_f16_f32(<2 x half> addrspace(1)* %src1,
-                                              <2 x half> addrspace(1)* %src2,
-                                              float addrspace(1)* nocapture %dst) {
+define amdgpu_kernel void @dotproduct_f16_f32(ptr addrspace(1) %src1,
+                                              ptr addrspace(1) %src2,
+                                              ptr addrspace(1) nocapture %dst) {
 entry:
-  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
-  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+  %src1.vec = load <2 x half>, ptr addrspace(1) %src1
+  %src2.vec = load <2 x half>, ptr addrspace(1) %src2
 
   %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
   %csrc1.el1 = fpext half %src1.el1 to float
@@ -78,10 +78,10 @@ entry:
 
   %mul2 = fmul float %csrc1.el2, %csrc2.el2
   %mul1 = fmul float %csrc1.el1, %csrc2.el1
-  %acc = load float, float addrspace(1)* %dst, align 4
+  %acc = load float, ptr addrspace(1) %dst, align 4
   %acc1 = fadd float %mul2, %acc
   %acc2 = fadd float %mul1, %acc1
-  store float %acc2, float addrspace(1)* %dst, align 4
+  store float %acc2, ptr addrspace(1) %dst, align 4
   ret void
 }
 
@@ -99,12 +99,12 @@ entry:
 
 ; GFX906-CONTRACT: v_dot2_f32_f16
 ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
-define amdgpu_kernel void @dotproduct_diffvecorder(<2 x half> addrspace(1)* %src1,
-                                                   <2 x half> addrspace(1)* %src2,
-                                                   float addrspace(1)* nocapture %dst) {
+define amdgpu_kernel void @dotproduct_diffvecorder(ptr addrspace(1) %src1,
+                                                   ptr addrspace(1) %src2,
+                                                   ptr addrspace(1) nocapture %dst) {
 entry:
-  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
-  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+  %src1.vec = load <2 x half>, ptr addrspace(1) %src1
+  %src2.vec = load <2 x half>, ptr addrspace(1) %src2
 
   %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
   %csrc1.el1 = fpext half %src1.el1 to float
@@ -118,10 +118,10 @@ entry:
 
   %mul2 = fmul float %csrc2.el2, %csrc1.el2
   %mul1 = fmul float %csrc1.el1, %csrc2.el1
-  %acc = load float, float addrspace(1)* %dst, align 4
+  %acc = load float, ptr addrspace(1) %dst, align 4
   %acc1 = fadd float %mul2, %acc
   %acc2 = fadd float %mul1, %acc1
-  store float %acc2, float addrspace(1)* %dst, align 4
+  store float %acc2, ptr addrspace(1) %dst, align 4
   ret void
 }
 
@@ -136,12 +136,12 @@ entry:
 
 ; GFX906-CONTRACT: v_fma_mix_f32
 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
-define amdgpu_kernel void @dotproduct_v4f16(<4 x half> addrspace(1)* %src1,
-                                            <4 x half> addrspace(1)* %src2,
-                                            float addrspace(1)* nocapture %dst) {
+define amdgpu_kernel void @dotproduct_v4f16(ptr addrspace(1) %src1,
+                                            ptr addrspace(1) %src2,
+                                            ptr addrspace(1) nocapture %dst) {
 entry:
-  %src1.vec = load <4 x half>, <4 x half> addrspace(1)* %src1
-  %src2.vec = load <4 x half>, <4 x half> addrspace(1)* %src2
+  %src1.vec = load <4 x half>, ptr addrspace(1) %src1
+  %src2.vec = load <4 x half>, ptr addrspace(1) %src2
 
   %src1.el1 = extractelement <4 x half> %src1.vec, i64 0
   %csrc1.el1 = fpext half %src1.el1 to float
@@ -155,10 +155,10 @@ entry:
 
   %mul2 = fmul float %csrc1.el2, %csrc2.el2
   %mul1 = fmul float %csrc1.el1, %csrc2.el1
-  %acc = load float, float addrspace(1)* %dst, align 4
+  %acc = load float, ptr addrspace(1) %dst, align 4
   %acc1 = fadd float %mul2, %acc
   %acc2 = fadd float %mul1, %acc1
-  store float %acc2, float addrspace(1)* %dst, align 4
+  store float %acc2, ptr addrspace(1) %dst, align 4
   ret void
 }
 
@@ -173,12 +173,12 @@ entry:
 
 ; GFX906-CONTRACT: v_fma_mix_f32
 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
-define amdgpu_kernel void @NotAdotproduct(<2 x half> addrspace(1)* %src1,
-                                          <2 x half> addrspace(1)* %src2,
-                                          float addrspace(1)* nocapture %dst) {
+define amdgpu_kernel void @NotAdotproduct(ptr addrspace(1) %src1,
+                                          ptr addrspace(1) %src2,
+                                          ptr addrspace(1) nocapture %dst) {
 entry:
-  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
-  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+  %src1.vec = load <2 x half>, ptr addrspace(1) %src1
+  %src2.vec = load <2 x half>, ptr addrspace(1) %src2
 
   %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
   %csrc1.el1 = fpext half %src1.el1 to float
@@ -192,10 +192,10 @@ entry:
 
   %mul2 = fmul float %csrc1.el2, %csrc1.el1
   %mul1 = fmul float %csrc2.el1, %csrc2.el2
-  %acc = load float, float addrspace(1)* %dst, align 4
+  %acc = load float, ptr addrspace(1) %dst, align 4
   %acc1 = fadd float %mul2, %acc
   %acc2 = fadd float %mul1, %acc1
-  store float %acc2, float addrspace(1)* %dst, align 4
+  store float %acc2, ptr addrspace(1) %dst, align 4
   ret void
 }
 
@@ -210,12 +210,12 @@ entry:
 
 ; GFX906-CONTRACT: v_fma_mix_f32
 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
-define amdgpu_kernel void @Diff_Idx_NotAdotproduct(<2 x half> addrspace(1)* %src1,
-                                                   <2 x half> addrspace(1)* %src2,
-                                                   float addrspace(1)* nocapture %dst) {
+define amdgpu_kernel void @Diff_Idx_NotAdotproduct(ptr addrspace(1) %src1,
+                                                   ptr addrspace(1) %src2,
+                                                   ptr addrspace(1) nocapture %dst) {
 entry:
-  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
-  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
+  %src1.vec = load <2 x half>, ptr addrspace(1) %src1
+  %src2.vec = load <2 x half>, ptr addrspace(1) %src2
 
   %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
   %csrc1.el1 = fpext half %src1.el1 to float
@@ -229,9 +229,9 @@ entry:
 
   %mul2 = fmul float %csrc1.el2, %csrc2.el1
   %mul1 = fmul float %csrc1.el1, %csrc2.el2
-  %acc = load float, float addrspace(1)* %dst, align 4
+  %acc = load float, ptr addrspace(1) %dst, align 4
   %acc1 = fadd float %mul2, %acc
   %acc2 = fadd float %mul1, %acc1
-  store float %acc2, float addrspace(1)* %dst, align 4
+  store float %acc2, ptr addrspace(1) %dst, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
index 47287f67b01ef..a56ff8c3afb36 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
@@ -7,7 +7,7 @@
 
 %struct.Data = type { [20 x i32] }
 
-define i32 @fp_save_restore_in_temp_sgpr(%struct.Data addrspace(5)* nocapture readonly byval(%struct.Data) align 4 %arg) #0 {
+define i32 @fp_save_restore_in_temp_sgpr(ptr addrspace(5) nocapture readonly byval(%struct.Data) align 4 %arg) #0 {
   ; GCN-LABEL: name: fp_save_restore_in_temp_sgpr
   ; GCN: bb.0.begin:
   ; GCN:   liveins: $sgpr11
@@ -32,8 +32,8 @@ lp_end:                                                ; preds = %lp_begin
 
 lp_begin:                                                ; preds = %lp_end, %begin
   %idx = phi i32 [ 0, %begin ], [ %cur_idx, %lp_end ]
-  %ptr = getelementptr inbounds %struct.Data, %struct.Data addrspace(5)* %arg, i32 0, i32 0, i32 %idx
-  %data = load i32, i32 addrspace(5)* %ptr, align 4
+  %ptr = getelementptr inbounds %struct.Data, ptr addrspace(5) %arg, i32 0, i32 0, i32 %idx
+  %data = load i32, ptr addrspace(5) %ptr, align 4
   %data_cmp = icmp eq i32 %data, %idx
   br i1 %data_cmp, label %lp_end, label %end
 

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 2575c041c0a8e..e16b846d5701d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -13,43 +13,43 @@
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
 ; GCN: flat_store_{{dword|b32}} v[[[LO_VREG]]:[[HI_VREG]]], v[[DATA]]
-define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
-  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
-  store volatile i32 %x, i32* %fptr, align 4
+define amdgpu_kernel void @store_flat_i32(ptr addrspace(1) %gptr, i32 %x) #0 {
+  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
+  store volatile i32 %x, ptr %fptr, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_flat_i64:
 ; GCN: flat_store_{{dwordx2|b64}}
-define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
-  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64*
-  store volatile i64 %x, i64* %fptr, align 8
+define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 {
+  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
+  store volatile i64 %x, ptr %fptr, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_flat_v4i32:
 ; GCN: flat_store_{{dwordx4|b128}}
-define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
-  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>*
-  store volatile <4 x i32> %x, <4 x i32>* %fptr, align 16
+define amdgpu_kernel void @store_flat_v4i32(ptr addrspace(1) %gptr, <4 x i32> %x) #0 {
+  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
+  store volatile <4 x i32> %x, ptr %fptr, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_flat_trunc_i16:
 ; GCN: flat_store_{{short|b16}}
-define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
-  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
+define amdgpu_kernel void @store_flat_trunc_i16(ptr addrspace(1) %gptr, i32 %x) #0 {
+  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   %y = trunc i32 %x to i16
-  store volatile i16 %y, i16* %fptr, align 2
+  store volatile i16 %y, ptr %fptr, align 2
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_flat_trunc_i8:
 ; GCN: flat_store_{{byte|b8}}
-define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
-  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
+define amdgpu_kernel void @store_flat_trunc_i8(ptr addrspace(1) %gptr, i32 %x) #0 {
+  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   %y = trunc i32 %x to i8
-  store volatile i8 %y, i8* %fptr, align 2
+  store volatile i8 %y, ptr %fptr, align 2
   ret void
 }
 
@@ -57,68 +57,68 @@ define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #
 
 ; GCN-LABEL: load_flat_i32:
 ; GCN: flat_load_{{dword|b32}}
-define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
-  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
-  %fload = load volatile i32, i32* %fptr, align 4
-  store i32 %fload, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @load_flat_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
+  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
+  %fload = load volatile i32, ptr %fptr, align 4
+  store i32 %fload, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: load_flat_i64:
 ; GCN: flat_load_{{dwordx2|b64}}
-define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
-  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64*
-  %fload = load volatile i64, i64* %fptr, align 8
-  store i64 %fload, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
+  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
+  %fload = load volatile i64, ptr %fptr, align 8
+  store i64 %fload, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; GCN-LABEL: load_flat_v4i32:
 ; GCN: flat_load_{{dwordx4|b128}}
-define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
-  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>*
-  %fload = load volatile <4 x i32>, <4 x i32>* %fptr, align 32
-  store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
+define amdgpu_kernel void @load_flat_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
+  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
+  %fload = load volatile <4 x i32>, ptr %fptr, align 32
+  store <4 x i32> %fload, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; GCN-LABEL: sextload_flat_i8:
 ; GCN: flat_load_{{sbyte|i8}}
-define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
-  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
-  %fload = load volatile i8, i8* %fptr, align 4
+define amdgpu_kernel void @sextload_flat_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
+  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
+  %fload = load volatile i8, ptr %fptr, align 4
   %ext = sext i8 %fload to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: zextload_flat_i8:
 ; GCN: flat_load_{{ubyte|u8}}
-define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
-  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
-  %fload = load volatile i8, i8* %fptr, align 4
+define amdgpu_kernel void @zextload_flat_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
+  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
+  %fload = load volatile i8, ptr %fptr, align 4
   %ext = zext i8 %fload to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: sextload_flat_i16:
 ; GCN: flat_load_{{sshort|i16}}
-define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
-  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
-  %fload = load volatile i16, i16* %fptr, align 4
+define amdgpu_kernel void @sextload_flat_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
+  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
+  %fload = load volatile i16, ptr %fptr, align 4
   %ext = sext i16 %fload to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: zextload_flat_i16:
 ; GCN: flat_load_{{ushort|u16}}
-define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
-  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
-  %fload = load volatile i16, i16* %fptr, align 4
+define amdgpu_kernel void @zextload_flat_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
+  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
+  %fload = load volatile i16, ptr %fptr, align 4
   %ext = zext i16 %fload to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -129,9 +129,9 @@ define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16
 ; GCN: flat_load_{{ubyte|u8}}
 define amdgpu_kernel void @flat_scratch_unaligned_load() {
   %scratch = alloca i32, addrspace(5)
-  %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
-  store volatile i32* %fptr, i32* addrspace(3)* null
-  %ld = load volatile i32, i32* %fptr, align 1
+  %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
+  store volatile ptr %fptr, ptr addrspace(3) null
+  %ld = load volatile i32, ptr %fptr, align 1
   ret void
 }
 
@@ -142,9 +142,9 @@ define amdgpu_kernel void @flat_scratch_unaligned_load() {
 ; GCN: flat_store_{{byte|b8}}
 define amdgpu_kernel void @flat_scratch_unaligned_store() {
   %scratch = alloca i32, addrspace(5)
-  %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
-  store volatile i32* %fptr, i32* addrspace(3)* null
-  store volatile i32 0, i32* %fptr, align 1
+  %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
+  store volatile ptr %fptr, ptr addrspace(3) null
+  store volatile i32 0, ptr %fptr, align 1
   ret void
 }
 
@@ -156,8 +156,8 @@ define amdgpu_kernel void @flat_scratch_unaligned_store() {
 ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
 define amdgpu_kernel void @flat_scratch_multidword_load() {
   %scratch = alloca <2 x i32>, addrspace(5)
-  %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>*
-  %ld = load volatile <2 x i32>, <2 x i32>* %fptr
+  %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
+  %ld = load volatile <2 x i32>, ptr %fptr
   ret void
 }
 
@@ -169,25 +169,25 @@ define amdgpu_kernel void @flat_scratch_multidword_load() {
 ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
 define amdgpu_kernel void @flat_scratch_multidword_store() {
   %scratch = alloca <2 x i32>, addrspace(5)
-  %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>*
-  store volatile <2 x i32> zeroinitializer, <2 x i32>* %fptr
+  %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
+  store volatile <2 x i32> zeroinitializer, ptr %fptr
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_flat_i8_max_offset:
 ; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
 ; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4095{{$}}
-define amdgpu_kernel void @store_flat_i8_max_offset(i8* %fptr, i8 %x) #0 {
-  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
-  store volatile i8 %x, i8* %fptr.offset
+define amdgpu_kernel void @store_flat_i8_max_offset(ptr %fptr, i8 %x) #0 {
+  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4095
+  store volatile i8 %x, ptr %fptr.offset
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_flat_i8_max_offset_p1:
 ; GCN: flat_store_{{byte|b8}} v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{( dlc)?}}{{$}}
-define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8* %fptr, i8 %x) #0 {
-  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
-  store volatile i8 %x, i8* %fptr.offset
+define amdgpu_kernel void @store_flat_i8_max_offset_p1(ptr %fptr, i8 %x) #0 {
+  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4096
+  store volatile i8 %x, ptr %fptr.offset
   ret void
 }
 
@@ -197,9 +197,9 @@ define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8* %fptr, i8 %x) #0 {
 ; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
 ; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
 ; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
-define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 {
-  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
-  store volatile i8 %x, i8* %fptr.offset
+define amdgpu_kernel void @store_flat_i8_neg_offset(ptr %fptr, i8 %x) #0 {
+  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 -2
+  store volatile i8 %x, ptr %fptr.offset
   ret void
 }
 
@@ -208,9 +208,9 @@ define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 {
 ; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}}
 ; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
 ; GFX11: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
-define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 {
-  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
-  %val = load volatile i8, i8* %fptr.offset
+define amdgpu_kernel void @load_flat_i8_max_offset(ptr %fptr) #0 {
+  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4095
+  %val = load volatile i8, ptr %fptr.offset
   ret void
 }
 
@@ -218,9 +218,9 @@ define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 {
 ; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GFX10PLUS: flat_load_{{ubyte|u8}} v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
-define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
-  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
-  %val = load volatile i8, i8* %fptr.offset
+define amdgpu_kernel void @load_flat_i8_max_offset_p1(ptr %fptr) #0 {
+  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4096
+  %val = load volatile i8, ptr %fptr.offset
   ret void
 }
 
@@ -230,9 +230,9 @@ define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
 ; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
 ; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
 ; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
-define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 {
-  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
-  %val = load volatile i8, i8* %fptr.offset
+define amdgpu_kernel void @load_flat_i8_neg_offset(ptr %fptr) #0 {
+  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 -2
+  %val = load volatile i8, ptr %fptr.offset
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
index 3f5708f927b42..f2872eb0ae3bf 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s
 
-define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(float* %ptr, float %data) {
+define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %data) {
   ; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
   ; GFX940: bb.0 (%ir-block.0):
   ; GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -25,11 +25,11 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(float* %ptr, float
   ; GFX11-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
   ; GFX11-NEXT:   FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
   ; GFX11-NEXT:   S_ENDPGM 0
-  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data)
+  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
   ret void
 }
 
-define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(float* %ptr, float %data) {
+define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data) {
   ; GFX940-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic
   ; GFX940: bb.0 (%ir-block.0):
   ; GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -54,11 +54,11 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(float* %ptr, float %d
   ; GFX11-NEXT:   [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
   ; GFX11-NEXT:   $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data)
+  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
   ret float %ret
 }
 
-define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(float* %ptr, float %data) #0 {
+define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %data) #0 {
   ; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw
   ; GFX940: bb.0 (%ir-block.0):
   ; GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -81,11 +81,11 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(float* %ptr, float
   ; GFX11-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
   ; GFX11-NEXT:   FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
   ; GFX11-NEXT:   S_ENDPGM 0
-  %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic
   ret void
 }
 
-define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(float* %ptr, float %data) #0 {
+define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data) #0 {
   ; GFX940-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw
   ; GFX940: bb.0 (%ir-block.0):
   ; GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -110,10 +110,10 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(float* %ptr, float %d
   ; GFX11-NEXT:   [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
   ; GFX11-NEXT:   $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic
   ret float %ret
 }
 
-declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float*, float)
+declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr, float)
 
 attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
index 8670cf168fd3c..169e96594fc80 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 
-define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(double* %ptr, double %data) {
+define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %data) {
   ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -17,11 +17,11 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(double* %ptr, doubl
   ; GFX90A_GFX940-NEXT:   [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX940-NEXT:   FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data)
   ret void
 }
 
-define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(double* %ptr, double %data) {
+define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %data) {
   ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -40,11 +40,11 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(double* %ptr, double
   ; GFX90A_GFX940-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; GFX90A_GFX940-NEXT:   $sgpr1 = COPY [[COPY7]]
   ; GFX90A_GFX940-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
-  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data)
   ret double %ret
 }
 
-define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(double* %ptr, double %data) #0 {
+define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) #0 {
   ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -59,11 +59,11 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(double* %ptr, doubl
   ; GFX90A_GFX940-NEXT:   [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX940-NEXT:   FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = atomicrmw fadd double* %ptr, double %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic
   ret void
 }
 
-define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(double* %ptr, double %data) #0 {
+define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) #0 {
   ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -82,10 +82,10 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(double* %ptr, double
   ; GFX90A_GFX940-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; GFX90A_GFX940-NEXT:   $sgpr1 = COPY [[COPY7]]
   ; GFX90A_GFX940-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
-  %ret = atomicrmw fadd double* %ptr, double %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic
   ret double %ret
 }
 
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double*, double)
+declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double)
 
 attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll
index 7cb03f4e71343..2a89301d3ea32 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX940 %s
 
-define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(<2 x half>* %ptr, <2 x half> %data) {
+define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(ptr %ptr, <2 x half> %data) {
   ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic
   ; GFX940: bb.0 (%ir-block.0):
   ; GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -13,11 +13,11 @@ define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(<2 x half>* %ptr,
   ; GFX940-NEXT:   [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX940-NEXT:   FLAT_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
   ; GFX940-NEXT:   S_ENDPGM 0
-  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data)
+  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data)
   ret void
 }
 
-define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(<2 x half>* %ptr, <2 x half> %data) {
+define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(ptr %ptr, <2 x half> %data) {
   ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_rtn_intrinsic
   ; GFX940: bb.0 (%ir-block.0):
   ; GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -30,8 +30,8 @@ define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(<2 x half>* %p
   ; GFX940-NEXT:   [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
   ; GFX940-NEXT:   $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]]
   ; GFX940-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data)
+  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data)
   ret <2 x half> %ret
 }
 
-declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half>*, <2 x half>)
+declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr, <2 x half>)

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-error-unsupported-gpu-hsa.ll b/llvm/test/CodeGen/AMDGPU/flat-error-unsupported-gpu-hsa.ll
index 21654bdb07f0f..61784f9fde2d6 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-error-unsupported-gpu-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-error-unsupported-gpu-hsa.ll
@@ -9,7 +9,7 @@
 
 ; ERROR: LLVM ERROR: Cannot select: {{0x[0-9,a-f]+|t[0-9]+}}: i32,ch = load<(volatile load (s32) from %ir.flat.ptr.load)>
 ; HSA-DEFAULT: flat_load_dword
-define amdgpu_kernel void @load_flat_i32(i32* %flat.ptr) {
-  %load = load volatile i32, i32* %flat.ptr, align 4
+define amdgpu_kernel void @load_flat_i32(ptr %flat.ptr) {
+  %load = load volatile i32, ptr %flat.ptr, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index adb435db60ebe..11bf14d594b8f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -23,9 +23,9 @@
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
 ; NOHSA-NOADDR64: flat_store_dword
-define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out) {
 entry:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   ret void
 }
 
@@ -36,19 +36,18 @@ entry:
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
 ; NOHSA-NOADDR64: flat_store_dword
-define amdgpu_kernel void @test_addr64(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_addr64(ptr addrspace(1) %out) {
 entry:
-  %out.addr = alloca i32 addrspace(1)*, align 4, addrspace(5)
+  %out.addr = alloca ptr addrspace(1), align 4, addrspace(5)
 
-  store i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(5)* %out.addr, align 4
-  %ld0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %out.addr, align 4
+  store ptr addrspace(1) %out, ptr addrspace(5) %out.addr, align 4
+  %ld0 = load ptr addrspace(1), ptr addrspace(5) %out.addr, align 4
 
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %ld0, i32 0
-  store i32 1, i32 addrspace(1)* %arrayidx, align 4
+  store i32 1, ptr addrspace(1) %ld0, align 4
 
-  %ld1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %out.addr, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %ld1, i32 1
-  store i32 2, i32 addrspace(1)* %arrayidx1, align 4
+  %ld1 = load ptr addrspace(1), ptr addrspace(5) %out.addr, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr addrspace(1) %ld1, i32 1
+  store i32 2, ptr addrspace(1) %arrayidx1, align 4
 
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll
index cdde9f3cf9e78..9383dbe7bf343 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll
@@ -7,80 +7,80 @@
 ; GFX9_11: flat_store_{{dword|b32}} v[{{[0-9:]+}}], v{{[0-9]+}} offset:4
 ; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}}
 ; GFX10: flat_store_dword v[{{[0-9:]+}}], v{{[0-9]+}}{{$}}
-define void @flat_inst_offset(i32* nocapture %p) {
-  %gep = getelementptr inbounds i32, i32* %p, i64 1
-  %load = load i32, i32* %gep, align 4
+define void @flat_inst_offset(ptr nocapture %p) {
+  %gep = getelementptr inbounds i32, ptr %p, i64 1
+  %load = load i32, ptr %gep, align 4
   %inc = add nsw i32 %load, 1
-  store i32 %inc, i32* %gep, align 4
+  store i32 %inc, ptr %gep, align 4
   ret void
 }
 
 ; GCN-LABEL: global_inst_offset:
 ; GCN: global_load_{{dword|b32}} v{{[0-9]+}}, v[{{[0-9:]+}}], off offset:4
 ; GCN: global_store_{{dword|b32}} v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4
-define void @global_inst_offset(i32 addrspace(1)* nocapture %p) {
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 1
-  %load = load i32, i32 addrspace(1)* %gep, align 4
+define void @global_inst_offset(ptr addrspace(1) nocapture %p) {
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %p, i64 1
+  %load = load i32, ptr addrspace(1) %gep, align 4
   %inc = add nsw i32 %load, 1
-  store i32 %inc, i32 addrspace(1)* %gep, align 4
+  store i32 %inc, ptr addrspace(1) %gep, align 4
   ret void
 }
 
 ; GCN-LABEL: load_i16_lo:
 ; GFX9_11: flat_load_{{short_d16|d16_b16}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}}
 ; GFX10: flat_load_short_d16 v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @load_i16_lo(i16* %arg, <2 x i16>* %out) {
-  %gep = getelementptr inbounds i16, i16* %arg, i32 4
-  %ld = load i16, i16* %gep, align 2
+define amdgpu_kernel void @load_i16_lo(ptr %arg, ptr %out) {
+  %gep = getelementptr inbounds i16, ptr %arg, i32 4
+  %ld = load i16, ptr %gep, align 2
   %vec = insertelement <2 x i16> <i16 undef, i16 0>, i16 %ld, i32 0
   %v = add <2 x i16> %vec, %vec
-  store <2 x i16> %v, <2 x i16>* %out, align 4
+  store <2 x i16> %v, ptr %out, align 4
   ret void
 }
 
 ; GCN-LABEL: load_i16_hi:
 ; GFX9_11: flat_load_{{short_d16_hi|d16_hi_b16}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}}
 ; GFX10: flat_load_short_d16_hi v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @load_i16_hi(i16* %arg, <2 x i16>* %out) {
-  %gep = getelementptr inbounds i16, i16* %arg, i32 4
-  %ld = load i16, i16* %gep, align 2
+define amdgpu_kernel void @load_i16_hi(ptr %arg, ptr %out) {
+  %gep = getelementptr inbounds i16, ptr %arg, i32 4
+  %ld = load i16, ptr %gep, align 2
   %vec = insertelement <2 x i16> <i16 0, i16 undef>, i16 %ld, i32 1
   %v = add <2 x i16> %vec, %vec
-  store <2 x i16> %v, <2 x i16>* %out, align 4
+  store <2 x i16> %v, ptr %out, align 4
   ret void
 }
 
 ; GCN-LABEL: load_half_lo:
 ; GFX9_11: flat_load_{{short_d16|d16_b16}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}}
 ; GFX10: flat_load_short_d16 v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @load_half_lo(half* %arg, <2 x half>* %out) {
-  %gep = getelementptr inbounds half, half* %arg, i32 4
-  %ld = load half, half* %gep, align 2
+define amdgpu_kernel void @load_half_lo(ptr %arg, ptr %out) {
+  %gep = getelementptr inbounds half, ptr %arg, i32 4
+  %ld = load half, ptr %gep, align 2
   %vec = insertelement <2 x half> <half undef, half 0xH0000>, half %ld, i32 0
   %v = fadd <2 x half> %vec, %vec
-  store <2 x half> %v, <2 x half>* %out, align 4
+  store <2 x half> %v, ptr %out, align 4
   ret void
 }
 
 ; GCN-LABEL: load_half_hi:
 ; GFX9_11: flat_load_{{short_d16_hi|d16_hi_b16}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:8{{$}}
 ; GFX10: flat_load_short_d16_hi v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @load_half_hi(half* %arg, <2 x half>* %out) {
-  %gep = getelementptr inbounds half, half* %arg, i32 4
-  %ld = load half, half* %gep, align 2
+define amdgpu_kernel void @load_half_hi(ptr %arg, ptr %out) {
+  %gep = getelementptr inbounds half, ptr %arg, i32 4
+  %ld = load half, ptr %gep, align 2
   %vec = insertelement <2 x half> <half 0xH0000, half undef>, half %ld, i32 1
   %v = fadd <2 x half> %vec, %vec
-  store <2 x half> %v, <2 x half>* %out, align 4
+  store <2 x half> %v, ptr %out, align 4
   ret void
 }
 
 ; GCN-LABEL: load_float_lo:
 ; GFX9_11: flat_load_{{dword|b32}} v{{[0-9]+}}, v[{{[0-9:]+}}] offset:16{{$}}
 ; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @load_float_lo(float* %arg, float* %out) {
-  %gep = getelementptr inbounds float, float* %arg, i32 4
-  %ld = load float, float* %gep, align 4
+define amdgpu_kernel void @load_float_lo(ptr %arg, ptr %out) {
+  %gep = getelementptr inbounds float, ptr %arg, i32 4
+  %ld = load float, ptr %gep, align 4
   %v = fadd float %ld, %ld
-  store float %v, float* %out, align 4
+  store float %v, ptr %out, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
index 90d4325d969f2..45d685d7664e1 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
@@ -29,8 +29,8 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; FLAT_SCR_ARCH-NEXT:    s_endpgm
   %alloca = alloca i32, addrspace(5)
-  %cast = addrspacecast i32 addrspace(5)* %alloca to i32*
-  store volatile i32 0, i32* %cast
+  %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+  store volatile i32 0, ptr %cast
   ret void
 }
 
@@ -55,7 +55,7 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; FLAT_SCR_ARCH-NEXT:    s_endpgm
   %alloca = alloca i32, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
+  store volatile i32 0, ptr addrspace(5) %alloca
   ret void
 }
 
@@ -109,7 +109,7 @@ define amdgpu_kernel void @kernel_calls_no_stack() {
   ret void
 }
 
-define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
 ; FLAT_SCR_OPT-LABEL: test:
 ; FLAT_SCR_OPT:       ; %bb.0:
 ; FLAT_SCR_OPT-NEXT:    s_add_u32 s2, s2, s5
@@ -400,7 +400,7 @@ define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) {
   call void asm sideeffect "", "~{v[240:247]}" ()
   call void asm sideeffect "", "~{v[248:255]}" ()
 
-  store i32 %in, i32 addrspace(1)* %out
+  store i32 %in, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 98b5ea06ce4ce..4d162e26ebd05 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -77,16 +77,16 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 bb:
   %soff1 = mul i32 %soff, 1
   %a = alloca i8, i32 64, align 4, addrspace(5)
-  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff1
+  %as = getelementptr i8, ptr addrspace(5) %a, i32 %soff1
   %voff = call i32 @llvm.amdgcn.workitem.id.x()
   %voff1 = mul i32 %voff, 1
-  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff1
-  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
-  store volatile i8 1, i8 addrspace(5)* %p1
-  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
-  store volatile i8 2, i8 addrspace(5)* %p2
-  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
-  store volatile i8 4, i8 addrspace(5)* %p4
+  %asv = getelementptr i8, ptr addrspace(5) %as, i32 %voff1
+  %p1 = getelementptr i8, ptr addrspace(5) %asv, i32 1
+  store volatile i8 1, ptr addrspace(5) %p1
+  %p2 = getelementptr i8, ptr addrspace(5) %asv, i32 2
+  store volatile i8 2, ptr addrspace(5) %p2
+  %p4 = getelementptr i8, ptr addrspace(5) %asv, i32 4
+  store volatile i8 4, ptr addrspace(5) %p4
   ret void
 }
 
@@ -162,16 +162,16 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 bb:
   %soff1 = mul i32 %soff, 1
   %a = alloca i8, i32 64, align 4, addrspace(5)
-  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff1
+  %as = getelementptr i8, ptr addrspace(5) %a, i32 %soff1
   %voff = call i32 @llvm.amdgcn.workitem.id.x()
   %voff2 = mul i32 %voff, 2
-  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff2
-  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
-  store volatile i8 1, i8 addrspace(5)* %p1
-  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
-  store volatile i8 2, i8 addrspace(5)* %p2
-  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
-  store volatile i8 4, i8 addrspace(5)* %p4
+  %asv = getelementptr i8, ptr addrspace(5) %as, i32 %voff2
+  %p1 = getelementptr i8, ptr addrspace(5) %asv, i32 1
+  store volatile i8 1, ptr addrspace(5) %p1
+  %p2 = getelementptr i8, ptr addrspace(5) %asv, i32 2
+  store volatile i8 2, ptr addrspace(5) %p2
+  %p4 = getelementptr i8, ptr addrspace(5) %asv, i32 4
+  store volatile i8 4, ptr addrspace(5) %p4
   ret void
 }
 
@@ -246,16 +246,16 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 bb:
   %soff1 = mul i32 %soff, 1
   %a = alloca i8, i32 64, align 4, addrspace(5)
-  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff1
+  %as = getelementptr i8, ptr addrspace(5) %a, i32 %soff1
   %voff = call i32 @llvm.amdgcn.workitem.id.x()
   %voff4 = mul i32 %voff, 4
-  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff4
-  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
-  store volatile i8 1, i8 addrspace(5)* %p1
-  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
-  store volatile i8 2, i8 addrspace(5)* %p2
-  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
-  store volatile i8 4, i8 addrspace(5)* %p4
+  %asv = getelementptr i8, ptr addrspace(5) %as, i32 %voff4
+  %p1 = getelementptr i8, ptr addrspace(5) %asv, i32 1
+  store volatile i8 1, ptr addrspace(5) %p1
+  %p2 = getelementptr i8, ptr addrspace(5) %asv, i32 2
+  store volatile i8 2, ptr addrspace(5) %p2
+  %p4 = getelementptr i8, ptr addrspace(5) %asv, i32 4
+  store volatile i8 4, ptr addrspace(5) %p4
   ret void
 }
 
@@ -333,16 +333,16 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 bb:
   %soff2 = mul i32 %soff, 2
   %a = alloca i8, i32 64, align 4, addrspace(5)
-  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff2
+  %as = getelementptr i8, ptr addrspace(5) %a, i32 %soff2
   %voff = call i32 @llvm.amdgcn.workitem.id.x()
   %voff1 = mul i32 %voff, 1
-  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff1
-  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
-  store volatile i8 1, i8 addrspace(5)* %p1
-  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
-  store volatile i8 2, i8 addrspace(5)* %p2
-  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
-  store volatile i8 4, i8 addrspace(5)* %p4
+  %asv = getelementptr i8, ptr addrspace(5) %as, i32 %voff1
+  %p1 = getelementptr i8, ptr addrspace(5) %asv, i32 1
+  store volatile i8 1, ptr addrspace(5) %p1
+  %p2 = getelementptr i8, ptr addrspace(5) %asv, i32 2
+  store volatile i8 2, ptr addrspace(5) %p2
+  %p4 = getelementptr i8, ptr addrspace(5) %asv, i32 4
+  store volatile i8 4, ptr addrspace(5) %p4
   ret void
 }
 
@@ -422,16 +422,16 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 bb:
   %soff2 = mul i32 %soff, 2
   %a = alloca i8, i32 64, align 4, addrspace(5)
-  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff2
+  %as = getelementptr i8, ptr addrspace(5) %a, i32 %soff2
   %voff = call i32 @llvm.amdgcn.workitem.id.x()
   %voff2 = mul i32 %voff, 2
-  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff2
-  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
-  store volatile i8 1, i8 addrspace(5)* %p1
-  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
-  store volatile i8 2, i8 addrspace(5)* %p2
-  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
-  store volatile i8 4, i8 addrspace(5)* %p4
+  %asv = getelementptr i8, ptr addrspace(5) %as, i32 %voff2
+  %p1 = getelementptr i8, ptr addrspace(5) %asv, i32 1
+  store volatile i8 1, ptr addrspace(5) %p1
+  %p2 = getelementptr i8, ptr addrspace(5) %asv, i32 2
+  store volatile i8 2, ptr addrspace(5) %p2
+  %p4 = getelementptr i8, ptr addrspace(5) %asv, i32 4
+  store volatile i8 4, ptr addrspace(5) %p4
   ret void
 }
 
@@ -511,16 +511,16 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 bb:
   %soff2 = mul i32 %soff, 2
   %a = alloca i8, i32 64, align 4, addrspace(5)
-  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff2
+  %as = getelementptr i8, ptr addrspace(5) %a, i32 %soff2
   %voff = call i32 @llvm.amdgcn.workitem.id.x()
   %voff4 = mul i32 %voff, 4
-  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff4
-  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
-  store volatile i8 1, i8 addrspace(5)* %p1
-  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
-  store volatile i8 2, i8 addrspace(5)* %p2
-  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
-  store volatile i8 4, i8 addrspace(5)* %p4
+  %asv = getelementptr i8, ptr addrspace(5) %as, i32 %voff4
+  %p1 = getelementptr i8, ptr addrspace(5) %asv, i32 1
+  store volatile i8 1, ptr addrspace(5) %p1
+  %p2 = getelementptr i8, ptr addrspace(5) %asv, i32 2
+  store volatile i8 2, ptr addrspace(5) %p2
+  %p4 = getelementptr i8, ptr addrspace(5) %asv, i32 4
+  store volatile i8 4, ptr addrspace(5) %p4
   ret void
 }
 
@@ -599,16 +599,16 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 bb:
   %soff4 = mul i32 %soff, 4
   %a = alloca i8, i32 64, align 4, addrspace(5)
-  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff4
+  %as = getelementptr i8, ptr addrspace(5) %a, i32 %soff4
   %voff = call i32 @llvm.amdgcn.workitem.id.x()
   %voff1 = mul i32 %voff, 1
-  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff1
-  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
-  store volatile i8 1, i8 addrspace(5)* %p1
-  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
-  store volatile i8 2, i8 addrspace(5)* %p2
-  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
-  store volatile i8 4, i8 addrspace(5)* %p4
+  %asv = getelementptr i8, ptr addrspace(5) %as, i32 %voff1
+  %p1 = getelementptr i8, ptr addrspace(5) %asv, i32 1
+  store volatile i8 1, ptr addrspace(5) %p1
+  %p2 = getelementptr i8, ptr addrspace(5) %asv, i32 2
+  store volatile i8 2, ptr addrspace(5) %p2
+  %p4 = getelementptr i8, ptr addrspace(5) %asv, i32 4
+  store volatile i8 4, ptr addrspace(5) %p4
   ret void
 }
 
@@ -690,16 +690,16 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 bb:
   %soff4 = mul i32 %soff, 4
   %a = alloca i8, i32 64, align 4, addrspace(5)
-  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff4
+  %as = getelementptr i8, ptr addrspace(5) %a, i32 %soff4
   %voff = call i32 @llvm.amdgcn.workitem.id.x()
   %voff2 = mul i32 %voff, 2
-  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff2
-  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
-  store volatile i8 1, i8 addrspace(5)* %p1
-  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
-  store volatile i8 2, i8 addrspace(5)* %p2
-  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
-  store volatile i8 4, i8 addrspace(5)* %p4
+  %asv = getelementptr i8, ptr addrspace(5) %as, i32 %voff2
+  %p1 = getelementptr i8, ptr addrspace(5) %asv, i32 1
+  store volatile i8 1, ptr addrspace(5) %p1
+  %p2 = getelementptr i8, ptr addrspace(5) %asv, i32 2
+  store volatile i8 2, ptr addrspace(5) %p2
+  %p4 = getelementptr i8, ptr addrspace(5) %asv, i32 4
+  store volatile i8 4, ptr addrspace(5) %p4
   ret void
 }
 
@@ -779,15 +779,15 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 bb:
   %soff4 = mul i32 %soff, 4
   %a = alloca i8, i32 64, align 4, addrspace(5)
-  %as = getelementptr i8, i8 addrspace(5)* %a, i32 %soff4
+  %as = getelementptr i8, ptr addrspace(5) %a, i32 %soff4
   %voff = call i32 @llvm.amdgcn.workitem.id.x()
   %voff4 = mul i32 %voff, 4
-  %asv = getelementptr i8, i8 addrspace(5)* %as, i32 %voff4
-  %p1 = getelementptr i8, i8 addrspace(5)* %asv, i32 1
-  store volatile i8 1, i8 addrspace(5)* %p1
-  %p2 = getelementptr i8, i8 addrspace(5)* %asv, i32 2
-  store volatile i8 2, i8 addrspace(5)* %p2
-  %p4 = getelementptr i8, i8 addrspace(5)* %asv, i32 4
-  store volatile i8 4, i8 addrspace(5)* %p4
+  %asv = getelementptr i8, ptr addrspace(5) %as, i32 %voff4
+  %p1 = getelementptr i8, ptr addrspace(5) %asv, i32 1
+  store volatile i8 1, ptr addrspace(5) %p1
+  %p2 = getelementptr i8, ptr addrspace(5) %asv, i32 2
+  store volatile i8 2, ptr addrspace(5) %p2
+  %p4 = getelementptr i8, ptr addrspace(5) %asv, i32 4
+  store volatile i8 4, ptr addrspace(5) %p4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 06ee6b4998eaf..0cc45f28fc501 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -177,8 +177,7 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PAL-NEXT:    s_endpgm
   %alloca = alloca [32 x i16], align 2, addrspace(5)
-  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
-  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
   ret void
 }
 
@@ -326,8 +325,7 @@ define void @zero_init_foo() {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [32 x i16], align 2, addrspace(5)
-  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
-  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
   ret void
 }
 
@@ -481,14 +479,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GCN-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
-  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
-  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
-  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
-  store volatile i32 15, i32 addrspace(5)* %i8, align 4
+  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
   %i9 = and i32 %idx, 15
-  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
-  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
-  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -627,14 +622,11 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GCN-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
-  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
-  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
-  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
-  store volatile i32 15, i32 addrspace(5)* %i8, align 4
+  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
   %i9 = and i32 %idx, 15
-  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
-  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
-  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -753,16 +745,13 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GCN-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
-  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
   %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i3 = zext i32 %i2 to i64
-  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
-  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
-  store volatile i32 15, i32 addrspace(5)* %i8, align 4
+  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
   %i9 = sub nsw i32 31, %i2
-  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
-  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
-  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -877,18 +866,15 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
-  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
-  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
-  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
-  store volatile i32 15, i32 addrspace(5)* %i8, align 4
+  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
   %i9 = and i32 %idx, 15
-  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
-  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
-  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
-define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
+define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: private_ptr_foo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -955,8 +941,8 @@ define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
 ; GCN-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
-  store float 1.000000e+01, float addrspace(5)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
+  store float 1.000000e+01, ptr addrspace(5) %gep, align 4
   ret void
 }
 
@@ -1149,10 +1135,9 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX11-PAL-NEXT:    s_endpgm
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
-  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
-  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
-  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
+  call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
   ret void
 }
 
@@ -1317,10 +1302,9 @@ define void @zero_init_small_offset_foo() {
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
-  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
-  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
-  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
+  call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
   ret void
 }
 
@@ -1503,16 +1487,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
-  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
-  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
-  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
-  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
-  store volatile i32 15, i32 addrspace(5)* %i8, align 4
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
+  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
   %i9 = and i32 %idx, 15
-  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
-  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
-  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -1681,16 +1662,13 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
-  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
-  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
-  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
-  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
-  store volatile i32 15, i32 addrspace(5)* %i8, align 4
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
+  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
   %i9 = and i32 %idx, 15
-  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
-  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
-  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -1838,18 +1816,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
-  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
-  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i3 = zext i32 %i2 to i64
-  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
-  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
-  store volatile i32 15, i32 addrspace(5)* %i8, align 4
+  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
   %i9 = sub nsw i32 31, %i2
-  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
-  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
-  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -1985,16 +1960,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
-  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
-  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
-  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
-  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
-  store volatile i32 15, i32 addrspace(5)* %i8, align 4
+  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
+  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
   %i9 = and i32 %idx, 15
-  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
-  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
-  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -2208,10 +2180,9 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX11-PAL-NEXT:    s_endpgm
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
-  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
-  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
-  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
+  call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
   ret void
 }
 
@@ -2387,10 +2358,9 @@ define void @zero_init_large_offset_foo() {
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
-  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
-  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
-  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
+  call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
   ret void
 }
 
@@ -2573,16 +2543,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
-  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
-  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
-  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
-  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
-  store volatile i32 15, i32 addrspace(5)* %i8, align 4
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
+  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
   %i9 = and i32 %idx, 15
-  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
-  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
-  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -2751,16 +2718,13 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
-  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
-  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
-  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
-  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
-  store volatile i32 15, i32 addrspace(5)* %i8, align 4
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
+  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
   %i9 = and i32 %idx, 15
-  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
-  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
-  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -2911,18 +2875,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
-  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
-  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
   %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i3 = zext i32 %i2 to i64
-  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
-  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
-  store volatile i32 15, i32 addrspace(5)* %i8, align 4
+  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
   %i9 = sub nsw i32 31, %i2
-  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
-  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
-  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -3068,16 +3029,13 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
-  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
-  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
-  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
-  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
-  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
-  store volatile i32 15, i32 addrspace(5)* %i8, align 4
+  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
+  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
+  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
   %i9 = and i32 %idx, 15
-  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
-  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
-  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -3224,12 +3182,12 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
-  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
-  store volatile i32 13, i32 addrspace(5)* %i1, align 4
-  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
-  store volatile i32 15, i32 addrspace(5)* %i7, align 4
-  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
-  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
+  %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
+  store volatile i32 13, ptr addrspace(5) %i1, align 4
+  %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
+  %i10 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -3356,12 +3314,12 @@ define void @store_load_large_imm_offset_foo() {
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
-  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
-  store volatile i32 13, i32 addrspace(5)* %i1, align 4
-  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
-  store volatile i32 15, i32 addrspace(5)* %i7, align 4
-  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
-  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
+  %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
+  store volatile i32 13, ptr addrspace(5) %i1, align 4
+  %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
+  store volatile i32 15, ptr addrspace(5) %i7, align 4
+  %i10 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
+  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
   ret void
 }
 
@@ -3492,13 +3450,13 @@ bb:
   %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
   %add1 = add nsw i32 %sidx, %vidx
   %add2 = add nsw i32 %add1, 256
-  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
-  store volatile i32 15, i32 addrspace(5)* %gep, align 4
-  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
+  %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %add2
+  store volatile i32 15, ptr addrspace(5) %gep, align 4
+  %load = load volatile i32, ptr addrspace(5) %gep, align 4
   ret void
 }
 
-define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
+define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_i64_aligned:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3588,12 +3546,12 @@ define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  store volatile i64 15, i64 addrspace(5)* %arg, align 8
-  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
+  store volatile i64 15, ptr addrspace(5) %arg, align 8
+  %load = load volatile i64, ptr addrspace(5) %arg, align 8
   ret void
 }
 
-define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
+define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_i64_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3683,12 +3641,12 @@ define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  store volatile i64 15, i64 addrspace(5)* %arg, align 1
-  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
+  store volatile i64 15, ptr addrspace(5) %arg, align 1
+  %load = load volatile i64, ptr addrspace(5) %arg, align 1
   ret void
 }
 
-define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
+define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_v3i32_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3786,12 +3744,12 @@ define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
-  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
+  store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
+  %load = load volatile <3 x i32>, ptr addrspace(5) %arg, align 1
   ret void
 }
 
-define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
+define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_v4i32_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3895,12 +3853,12 @@ define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
-  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
+  %load = load volatile <4 x i32>, ptr addrspace(5) %arg, align 1
   ret void
 }
 
-define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
+define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_i32_negative_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3990,13 +3948,13 @@ define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg)
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1
-  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
-  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -1
+  store volatile i8 1, ptr addrspace(5) %ptr, align 1
+  %load = load volatile i8, ptr addrspace(5) %ptr, align 1
   ret void
 }
 
-define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
+define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4088,9 +4046,9 @@ define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225
-  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
-  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -4225
+  store volatile i8 1, ptr addrspace(5) %ptr, align 1
+  %load = load volatile i8, ptr addrspace(5) %ptr, align 1
   ret void
 }
 
@@ -4267,13 +4225,13 @@ define amdgpu_ps void @large_offset() {
 bb:
   %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5)
   %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5)
-  %gep = getelementptr inbounds [128 x <4 x i32>], [128 x <4 x i32>] addrspace(5)* %alloca2, i32 0, i32 60
-  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep, align 16
-  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %gep, align 16
-  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca) #0
-  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca2) #0
+  %gep = getelementptr inbounds [128 x <4 x i32>], ptr addrspace(5) %alloca2, i32 0, i32 60
+  store volatile <4 x i32> zeroinitializer, ptr addrspace(5) %gep, align 16
+  %load = load volatile <4 x i32>, ptr addrspace(5) %gep, align 16
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %alloca) #0
+  call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %alloca2) #0
   ret void
 }
 
-declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)
 declare i32 @llvm.amdgcn.workitem.id.x()

diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index b6337a3c903dc..a15854c061825 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN3 %s
 
 ; GCN-LABEL: {{^}}atomic_add_i32_offset:
-define amdgpu_kernel void @atomic_add_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_add_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -51,13 +51,13 @@ define amdgpu_kernel void @atomic_add_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile add ptr %gep, i32 %in seq_cst
   ret void
 }
 
 ; GCN-LABEL: {{^}}atomic_add_i32_max_offset:
-define amdgpu_kernel void @atomic_add_i32_max_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_add_i32_max_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -104,12 +104,12 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 1023
-  %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr %out, i32 1023
+  %val = atomicrmw volatile add ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_max_offset_p1(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_add_i32_max_offset_p1:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -158,12 +158,12 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 1024
-  %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr %out, i32 1024
+  %val = atomicrmw volatile add ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_add_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -219,14 +219,14 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile add ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
 ; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset:
-define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -285,13 +285,13 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile add ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -359,14 +359,14 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32* %out, i32* %out
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile add ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_add_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -409,11 +409,11 @@ define amdgpu_kernel void @atomic_add_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile add i32* %out, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_add_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -465,12 +465,12 @@ define amdgpu_kernel void @atomic_add_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile add i32* %out, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile add ptr %out, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -525,12 +525,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64(i32* %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile add i32* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile add ptr %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -594,13 +594,13 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile add i32* %ptr, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile add ptr %ptr, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_and_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -647,12 +647,12 @@ define amdgpu_kernel void @atomic_and_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile and i32* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile and ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_and_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -708,13 +708,13 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile and i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile and ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -773,13 +773,13 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile and i32* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile and ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -847,14 +847,14 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32* %out, i32* %out
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile and i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile and ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_and_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -897,11 +897,11 @@ define amdgpu_kernel void @atomic_and_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile and i32* %out, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_and_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -953,12 +953,12 @@ define amdgpu_kernel void @atomic_and_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile and i32* %out, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile and ptr %out, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -1013,12 +1013,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64(i32* %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile and i32* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile and ptr %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -1082,13 +1082,13 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile and i32* %ptr, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile and ptr %ptr, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_sub_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -1135,12 +1135,12 @@ define amdgpu_kernel void @atomic_sub_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile sub i32* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile sub ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_sub_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1196,13 +1196,13 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile sub i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile sub ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -1261,13 +1261,13 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile sub i32* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile sub ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -1335,14 +1335,14 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32* %out, i32* %out
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile sub i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile sub ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_sub_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -1385,11 +1385,11 @@ define amdgpu_kernel void @atomic_sub_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile sub i32* %out, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_sub_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1441,12 +1441,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile sub i32* %out, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile sub ptr %out, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -1501,12 +1501,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(i32* %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile sub i32* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile sub ptr %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -1570,13 +1570,13 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile sub i32* %ptr, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile sub ptr %ptr, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_max_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -1620,12 +1620,12 @@ define amdgpu_kernel void @atomic_max_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile max i32* %gep, i32 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_max_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1681,13 +1681,13 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile max i32* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -1743,13 +1743,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile max i32* %gep, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -1817,14 +1817,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32* %out, i32* %out
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile max i32* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_max_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -1864,11 +1864,11 @@ define amdgpu_kernel void @atomic_max_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile max i32* %out, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_max_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1920,12 +1920,12 @@ define amdgpu_kernel void @atomic_max_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile max i32* %out, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -1977,12 +1977,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64(i32* %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile max i32* %ptr, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -2046,13 +2046,13 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile max i32* %ptr, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_umax_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -2096,12 +2096,12 @@ define amdgpu_kernel void @atomic_umax_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile umax i32* %gep, i32 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_umax_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2157,13 +2157,13 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile umax i32* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -2219,13 +2219,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile umax i32* %gep, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -2293,14 +2293,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32* %out, i32* %ou
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile umax i32* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_umax_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -2340,11 +2340,11 @@ define amdgpu_kernel void @atomic_umax_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umax i32* %out, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_umax_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2396,12 +2396,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umax i32* %out, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -2453,12 +2453,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(i32* %out, i32 %in, i64 %index
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile umax i32* %ptr, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -2522,13 +2522,13 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile umax i32* %ptr, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_min_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -2572,12 +2572,12 @@ define amdgpu_kernel void @atomic_min_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile min i32* %gep, i32 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_min_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2633,13 +2633,13 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile min i32* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -2695,13 +2695,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile min i32* %gep, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -2769,14 +2769,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32* %out, i32* %out
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile min i32* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_min_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -2816,11 +2816,11 @@ define amdgpu_kernel void @atomic_min_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile min i32* %out, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_min_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2872,12 +2872,12 @@ define amdgpu_kernel void @atomic_min_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile min i32* %out, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -2929,12 +2929,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64(i32* %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile min i32* %ptr, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -2998,13 +2998,13 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile min i32* %ptr, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_umin_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -3048,12 +3048,12 @@ define amdgpu_kernel void @atomic_umin_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile umin i32* %gep, i32 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_umin_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3109,13 +3109,13 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile umin i32* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -3171,13 +3171,13 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile umin i32* %gep, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -3245,14 +3245,14 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32* %out, i32* %ou
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile umin i32* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_umin_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -3292,11 +3292,11 @@ define amdgpu_kernel void @atomic_umin_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umin i32* %out, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_umin_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3348,12 +3348,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umin i32* %out, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -3405,12 +3405,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(i32* %out, i32 %in, i64 %index
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile umin i32* %ptr, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -3474,13 +3474,13 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile umin i32* %ptr, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_or_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -3527,12 +3527,12 @@ define amdgpu_kernel void @atomic_or_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile or i32* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile or ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_or_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3588,13 +3588,13 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(i32* %out, i32* %out2, i32 %
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile or i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile or ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -3653,13 +3653,13 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32* %out, i32 %in, i64 %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile or i32* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile or ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -3727,14 +3727,14 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32* %out, i32* %out2
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile or i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile or ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_or_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -3777,11 +3777,11 @@ define amdgpu_kernel void @atomic_or_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile or i32* %out, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_or_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3833,12 +3833,12 @@ define amdgpu_kernel void @atomic_or_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile or i32* %out, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile or ptr %out, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -3893,12 +3893,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64(i32* %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile or i32* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile or ptr %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -3962,13 +3962,13 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32* %out, i32* %out2, i32 %
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile or i32* %ptr, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile or ptr %ptr, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_xchg_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -4015,12 +4015,12 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile xchg i32* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile xchg ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_f32_offset(float* %out, float %in) {
+define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) {
 ; GCN1-LABEL: atomic_xchg_f32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -4067,12 +4067,12 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(float* %out, float %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr float, float* %out, i32 4
-  %val = atomicrmw volatile xchg float* %gep, float %in seq_cst
+  %gep = getelementptr float, ptr %out, i32 4
+  %val = atomicrmw volatile xchg ptr %gep, float %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_xchg_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4128,13 +4128,13 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile xchg i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile xchg ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -4193,13 +4193,13 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile xchg i32* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile xchg ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -4267,14 +4267,14 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32* %out, i32* %ou
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile xchg i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile xchg ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_xchg_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -4317,11 +4317,11 @@ define amdgpu_kernel void @atomic_xchg_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_xchg_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4373,12 +4373,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -4433,12 +4433,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(i32* %out, i32 %in, i64 %index
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile xchg i32* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile xchg ptr %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -4502,15 +4502,15 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile xchg i32* %ptr, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile xchg ptr %ptr, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
 ; CMP_SWAP
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4557,12 +4557,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32* %out, i32 %in, i32 %ol
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32* %out, i32* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4621,14 +4621,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32* %out, i32* %out2,
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
-  store i32 %flag, i32* %out2
+  store i32 %flag, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -4693,13 +4693,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32* %out, i32 %in,
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val  = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val  = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -4773,15 +4773,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32* %out, i32*
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val  = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val  = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
-  store i32 %flag, i32* %out2
+  store i32 %flag, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32(i32* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4824,11 +4824,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(i32* %out, i32 %in, i32 %old) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile i32* %out, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %out, i32 %old, i32 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32* %out, i32* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4883,13 +4883,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32* %out, i32* %out2, i32 %in
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile i32* %out, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %out, i32 %old, i32 %in seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
-  store i32 %flag, i32* %out2
+  store i32 %flag, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -4950,12 +4950,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32* %out, i32 %in, i64 %in
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = cmpxchg volatile i32* %ptr, i32 %old, i32 %in seq_cst seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -5025,14 +5025,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32* %out, i32* %out2,
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = cmpxchg volatile i32* %ptr, i32 %old, i32 %in seq_cst seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
-  store i32 %flag, i32* %out2
+  store i32 %flag, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_xor_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -5079,12 +5079,12 @@ define amdgpu_kernel void @atomic_xor_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile xor i32* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile xor ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_xor_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5140,13 +5140,13 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile xor i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile xor ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -5205,13 +5205,13 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile xor i32* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile xor ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -5279,14 +5279,14 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32* %out, i32* %out
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile xor i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile xor ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_xor_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -5329,11 +5329,11 @@ define amdgpu_kernel void @atomic_xor_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xor i32* %out, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_xor_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5385,12 +5385,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xor i32* %out, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile xor ptr %out, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -5445,12 +5445,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(i32* %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile xor i32* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile xor ptr %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -5514,13 +5514,13 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile xor i32* %ptr, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile xor ptr %ptr, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i32_offset(i32* %in, i32* %out) {
+define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
 ; GCN1-LABEL: atomic_load_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5570,13 +5570,13 @@ define amdgpu_kernel void @atomic_load_i32_offset(i32* %in, i32* %out) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %in, i32 4
-  %val = load atomic i32, i32* %gep  seq_cst, align 4
-  store i32 %val, i32* %out
+  %gep = getelementptr i32, ptr %in, i32 4
+  %val = load atomic i32, ptr %gep  seq_cst, align 4
+  store i32 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i32(i32* %in, i32* %out) {
+define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
 ; GCN1-LABEL: atomic_load_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5622,12 +5622,12 @@ define amdgpu_kernel void @atomic_load_i32(i32* %in, i32* %out) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = load atomic i32, i32* %in seq_cst, align 4
-  store i32 %val, i32* %out
+  %val = load atomic i32, ptr %in seq_cst, align 4
+  store i32 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32* %in, i32* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_load_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -5689,14 +5689,14 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32* %in, i32* %out, i6
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %in, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = load atomic i32, i32* %gep seq_cst, align 4
-  store i32 %val, i32* %out
+  %ptr = getelementptr i32, ptr %in, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = load atomic i32, ptr %gep seq_cst, align 4
+  store i32 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i32_addr64(i32* %in, i32* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_load_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -5754,13 +5754,13 @@ define amdgpu_kernel void @atomic_load_i32_addr64(i32* %in, i32* %out, i64 %inde
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %in, i64 %index
-  %val = load atomic i32, i32* %ptr seq_cst, align 4
-  store i32 %val, i32* %out
+  %ptr = getelementptr i32, ptr %in, i64 %index
+  %val = load atomic i32, ptr %ptr seq_cst, align 4
+  store i32 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32* %out) {
+define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
@@ -5801,12 +5801,12 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32* %out) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  store atomic i32 %in, i32* %gep  seq_cst, align 4
+  %gep = getelementptr i32, ptr %out, i32 4
+  store atomic i32 %in, ptr %gep  seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i32(i32 %in, i32* %out) {
+define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
@@ -5843,11 +5843,11 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, i32* %out) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  store atomic i32 %in, i32* %out seq_cst, align 4
+  store atomic i32 %in, ptr %out seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_store_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
@@ -5897,13 +5897,13 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32* %out, i6
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  store atomic i32 %in, i32* %gep seq_cst, align 4
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  store atomic i32 %in, ptr %gep seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_store_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
@@ -5949,12 +5949,12 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32* %out, i64 %inde
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  store atomic i32 %in, i32* %ptr seq_cst, align 4
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  store atomic i32 %in, ptr %ptr seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_f32_offset(float* %in, float* %out) {
+define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
 ; GCN1-LABEL: atomic_load_f32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6004,13 +6004,13 @@ define amdgpu_kernel void @atomic_load_f32_offset(float* %in, float* %out) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr float, float* %in, i32 4
-  %val = load atomic float, float* %gep  seq_cst, align 4
-  store float %val, float* %out
+  %gep = getelementptr float, ptr %in, i32 4
+  %val = load atomic float, ptr %gep  seq_cst, align 4
+  store float %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_f32(float* %in, float* %out) {
+define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
 ; GCN1-LABEL: atomic_load_f32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6056,12 +6056,12 @@ define amdgpu_kernel void @atomic_load_f32(float* %in, float* %out) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = load atomic float, float* %in seq_cst, align 4
-  store float %val, float* %out
+  %val = load atomic float, ptr %in seq_cst, align 4
+  store float %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_f32_addr64_offset(float* %in, float* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_load_f32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -6123,14 +6123,14 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(float* %in, float* %out
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr float, float* %in, i64 %index
-  %gep = getelementptr float, float* %ptr, i32 4
-  %val = load atomic float, float* %gep seq_cst, align 4
-  store float %val, float* %out
+  %ptr = getelementptr float, ptr %in, i64 %index
+  %gep = getelementptr float, ptr %ptr, i32 4
+  %val = load atomic float, ptr %gep seq_cst, align 4
+  store float %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_f32_addr64(float* %in, float* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_load_f32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -6188,13 +6188,13 @@ define amdgpu_kernel void @atomic_load_f32_addr64(float* %in, float* %out, i64 %
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr float, float* %in, i64 %index
-  %val = load atomic float, float* %ptr seq_cst, align 4
-  store float %val, float* %out
+  %ptr = getelementptr float, ptr %in, i64 %index
+  %val = load atomic float, ptr %ptr seq_cst, align 4
+  store float %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f32_offset(float %in, float* %out) {
+define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_f32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
@@ -6235,12 +6235,12 @@ define amdgpu_kernel void @atomic_store_f32_offset(float %in, float* %out) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr float, float* %out, i32 4
-  store atomic float %in, float* %gep  seq_cst, align 4
+  %gep = getelementptr float, ptr %out, i32 4
+  store atomic float %in, ptr %gep  seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f32(float %in, float* %out) {
+define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_f32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
@@ -6277,11 +6277,11 @@ define amdgpu_kernel void @atomic_store_f32(float %in, float* %out) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  store atomic float %in, float* %out seq_cst, align 4
+  store atomic float %in, ptr %out seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, float* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_store_f32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
@@ -6331,13 +6331,13 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, float* %out
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr float, float* %out, i64 %index
-  %gep = getelementptr float, float* %ptr, i32 4
-  store atomic float %in, float* %gep seq_cst, align 4
+  %ptr = getelementptr float, ptr %out, i64 %index
+  %gep = getelementptr float, ptr %ptr, i32 4
+  store atomic float %in, ptr %gep seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f32_addr64(float %in, float* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_store_f32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
@@ -6383,12 +6383,12 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, float* %out, i64 %
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr float, float* %out, i64 %index
-  store atomic float %in, float* %ptr seq_cst, align 4
+  %ptr = getelementptr float, ptr %out, i64 %index
+  store atomic float %in, ptr %ptr seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i8_offset(i8* %in, i8* %out) {
+define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
 ; GCN1-LABEL: atomic_load_i8_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6438,13 +6438,13 @@ define amdgpu_kernel void @atomic_load_i8_offset(i8* %in, i8* %out) {
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i8, i8* %in, i64 16
-  %val = load atomic i8, i8* %gep  seq_cst, align 1
-  store i8 %val, i8* %out
+  %gep = getelementptr i8, ptr %in, i64 16
+  %val = load atomic i8, ptr %gep  seq_cst, align 1
+  store i8 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i8(i8* %in, i8* %out) {
+define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
 ; GCN1-LABEL: atomic_load_i8:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6490,12 +6490,12 @@ define amdgpu_kernel void @atomic_load_i8(i8* %in, i8* %out) {
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = load atomic i8, i8* %in seq_cst, align 1
-  store i8 %val, i8* %out
+  %val = load atomic i8, ptr %in seq_cst, align 1
+  store i8 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i8_addr64_offset(i8* %in, i8* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_load_i8_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -6554,14 +6554,14 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(i8* %in, i8* %out, i64 %
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i8, i8* %in, i64 %index
-  %gep = getelementptr i8, i8* %ptr, i64 16
-  %val = load atomic i8, i8* %gep seq_cst, align 1
-  store i8 %val, i8* %out
+  %ptr = getelementptr i8, ptr %in, i64 %index
+  %gep = getelementptr i8, ptr %ptr, i64 16
+  %val = load atomic i8, ptr %gep seq_cst, align 1
+  store i8 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, i8* %out) {
+define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_i8_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
@@ -6602,12 +6602,12 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, i8* %out) {
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i8, i8* %out, i64 16
-  store atomic i8 %in, i8* %gep  seq_cst, align 1
+  %gep = getelementptr i8, ptr %out, i64 16
+  store atomic i8 %in, ptr %gep  seq_cst, align 1
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i8(i8 %in, i8* %out) {
+define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_i8:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
@@ -6644,11 +6644,11 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, i8* %out) {
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  store atomic i8 %in, i8* %out seq_cst, align 1
+  store atomic i8 %in, ptr %out seq_cst, align 1
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, i8* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_store_i8_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
@@ -6695,13 +6695,13 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, i8* %out, i64 %
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i8, i8* %out, i64 %index
-  %gep = getelementptr i8, i8* %ptr, i64 16
-  store atomic i8 %in, i8* %gep seq_cst, align 1
+  %ptr = getelementptr i8, ptr %out, i64 %index
+  %gep = getelementptr i8, ptr %ptr, i64 16
+  store atomic i8 %in, ptr %gep seq_cst, align 1
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i16_offset(i16* %in, i16* %out) {
+define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
 ; GCN1-LABEL: atomic_load_i16_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6751,13 +6751,13 @@ define amdgpu_kernel void @atomic_load_i16_offset(i16* %in, i16* %out) {
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i16, i16* %in, i64 8
-  %val = load atomic i16, i16* %gep  seq_cst, align 2
-  store i16 %val, i16* %out
+  %gep = getelementptr i16, ptr %in, i64 8
+  %val = load atomic i16, ptr %gep  seq_cst, align 2
+  store i16 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i16(i16* %in, i16* %out) {
+define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
 ; GCN1-LABEL: atomic_load_i16:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6803,12 +6803,12 @@ define amdgpu_kernel void @atomic_load_i16(i16* %in, i16* %out) {
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = load atomic i16, i16* %in seq_cst, align 2
-  store i16 %val, i16* %out
+  %val = load atomic i16, ptr %in seq_cst, align 2
+  store i16 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i16_addr64_offset(i16* %in, i16* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_load_i16_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -6870,14 +6870,14 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(i16* %in, i16* %out, i6
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i16, i16* %in, i64 %index
-  %gep = getelementptr i16, i16* %ptr, i64 8
-  %val = load atomic i16, i16* %gep seq_cst, align 2
-  store i16 %val, i16* %out
+  %ptr = getelementptr i16, ptr %in, i64 %index
+  %gep = getelementptr i16, ptr %ptr, i64 8
+  %val = load atomic i16, ptr %gep seq_cst, align 2
+  store i16 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, i16* %out) {
+define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_i16_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
@@ -6918,12 +6918,12 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, i16* %out) {
 ; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i16, i16* %out, i64 8
-  store atomic i16 %in, i16* %gep  seq_cst, align 2
+  %gep = getelementptr i16, ptr %out, i64 8
+  store atomic i16 %in, ptr %gep  seq_cst, align 2
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i16(i16 %in, i16* %out) {
+define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_i16:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
@@ -6960,11 +6960,11 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, i16* %out) {
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  store atomic i16 %in, i16* %out seq_cst, align 2
+  store atomic i16 %in, ptr %out seq_cst, align 2
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, i16* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_store_i16_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
@@ -7014,13 +7014,13 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, i16* %out, i6
 ; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i16, i16* %out, i64 %index
-  %gep = getelementptr i16, i16* %ptr, i64 8
-  store atomic i16 %in, i16* %gep seq_cst, align 2
+  %ptr = getelementptr i16, ptr %out, i64 %index
+  %gep = getelementptr i16, ptr %ptr, i64 8
+  store atomic i16 %in, ptr %gep seq_cst, align 2
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f16_offset(half %in, half* %out) {
+define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_f16_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
@@ -7061,12 +7061,12 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, half* %out) {
 ; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr half, half* %out, i64 8
-  store atomic half %in, half* %gep  seq_cst, align 2
+  %gep = getelementptr half, ptr %out, i64 8
+  store atomic half %in, ptr %gep  seq_cst, align 2
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f16(half %in, half* %out) {
+define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_f16:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
@@ -7103,6 +7103,6 @@ define amdgpu_kernel void @atomic_store_f16(half %in, half* %out) {
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  store atomic half %in, half* %out seq_cst, align 2
+  store atomic half %in, ptr %out seq_cst, align 2
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index dc7b8608ed438..41a4998b3ba91 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN1 %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN2 %s
 
-define amdgpu_kernel void @atomic_add_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_add_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -35,12 +35,12 @@ define amdgpu_kernel void @atomic_add_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile add i64* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_add_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -81,13 +81,13 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile add i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -128,13 +128,13 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile add i64* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -179,14 +179,14 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64* %out, i64* %out
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile add i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_add_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -215,11 +215,11 @@ define amdgpu_kernel void @atomic_add_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile add i64* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_add_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -256,12 +256,12 @@ define amdgpu_kernel void @atomic_add_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile add i64* %out, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile add ptr %out, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -298,12 +298,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64(i64* %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile add i64* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_add_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -344,13 +344,13 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile add i64* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_and_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -383,12 +383,12 @@ define amdgpu_kernel void @atomic_and_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile and i64* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_and_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -429,13 +429,13 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile and i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -476,13 +476,13 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile and i64* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -527,14 +527,14 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64* %out, i64* %out
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile and i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_and_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -563,11 +563,11 @@ define amdgpu_kernel void @atomic_and_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile and i64* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_and_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -604,12 +604,12 @@ define amdgpu_kernel void @atomic_and_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile and i64* %out, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile and ptr %out, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -646,12 +646,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64(i64* %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile and i64* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_and_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -692,13 +692,13 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile and i64* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_sub_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -731,12 +731,12 @@ define amdgpu_kernel void @atomic_sub_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile sub i64* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_sub_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -777,13 +777,13 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile sub i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -824,13 +824,13 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile sub i64* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -875,14 +875,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64* %out, i64* %out
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile sub i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_sub_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -911,11 +911,11 @@ define amdgpu_kernel void @atomic_sub_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile sub i64* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_sub_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -952,12 +952,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile sub i64* %out, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile sub ptr %out, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -994,12 +994,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(i64* %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile sub i64* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_sub_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1040,13 +1040,13 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile sub i64* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_max_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1077,12 +1077,12 @@ define amdgpu_kernel void @atomic_max_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile max i64* %gep, i64 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_max_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1123,13 +1123,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile max i64* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1168,13 +1168,13 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile max i64* %gep, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1219,14 +1219,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64* %out, i64* %out
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile max i64* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_max_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1253,11 +1253,11 @@ define amdgpu_kernel void @atomic_max_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile max i64* %out, i64 %in syncscope("workgroup") seq_cst
+  %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_max_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1294,12 +1294,12 @@ define amdgpu_kernel void @atomic_max_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile max i64* %out, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1334,12 +1334,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(i64* %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile max i64* %ptr, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1380,13 +1380,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile max i64* %ptr, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_umax_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1417,12 +1417,12 @@ define amdgpu_kernel void @atomic_umax_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile umax i64* %gep, i64 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_umax_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1463,13 +1463,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile umax i64* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1508,13 +1508,13 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umax i64* %gep, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1559,14 +1559,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64* %out, i64* %ou
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umax i64* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_umax_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1593,11 +1593,11 @@ define amdgpu_kernel void @atomic_umax_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umax i64* %out, i64 %in syncscope("workgroup") seq_cst
+  %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_umax_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1634,12 +1634,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umax i64* %out, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1674,12 +1674,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(i64* %out, i64 %in, i64 %index
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile umax i64* %ptr, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1720,13 +1720,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile umax i64* %ptr, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_min_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1757,12 +1757,12 @@ define amdgpu_kernel void @atomic_min_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile min i64* %gep, i64 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_min_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1803,13 +1803,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile min i64* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1848,13 +1848,13 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile min i64* %gep, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1899,14 +1899,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64* %out, i64* %out
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile min i64* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_min_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1933,11 +1933,11 @@ define amdgpu_kernel void @atomic_min_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile min i64* %out, i64 %in syncscope("workgroup") seq_cst
+  %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_min_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1974,12 +1974,12 @@ define amdgpu_kernel void @atomic_min_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile min i64* %out, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2014,12 +2014,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64(i64* %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile min i64* %ptr, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2060,13 +2060,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile min i64* %ptr, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_umin_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2097,12 +2097,12 @@ define amdgpu_kernel void @atomic_umin_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile umin i64* %gep, i64 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_umin_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -2143,13 +2143,13 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile umin i64* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2188,13 +2188,13 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umin i64* %gep, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2239,14 +2239,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64* %out, i64* %ou
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umin i64* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_umin_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2273,11 +2273,11 @@ define amdgpu_kernel void @atomic_umin_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umin i64* %out, i64 %in syncscope("workgroup") seq_cst
+  %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_umin_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2314,12 +2314,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umin i64* %out, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2354,12 +2354,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(i64* %out, i64 %in, i64 %index
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile umin i64* %ptr, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2400,13 +2400,13 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile umin i64* %ptr, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_or_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2439,12 +2439,12 @@ define amdgpu_kernel void @atomic_or_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile or i64* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_or_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -2485,13 +2485,13 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(i64* %out, i64* %out2, i64 %
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile or i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2532,13 +2532,13 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64* %out, i64 %in, i64 %
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile or i64* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2583,14 +2583,14 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64* %out, i64* %out2
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile or i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_or_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2619,11 +2619,11 @@ define amdgpu_kernel void @atomic_or_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile or i64* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_or_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2660,12 +2660,12 @@ define amdgpu_kernel void @atomic_or_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile or i64* %out, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile or ptr %out, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2702,12 +2702,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64(i64* %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile or i64* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_or_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2748,13 +2748,13 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64* %out, i64* %out2, i64 %
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile or i64* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_xchg_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2787,12 +2787,12 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile xchg i64* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_f64_offset(double* %out, double %in) {
+define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GCN1-LABEL: atomic_xchg_f64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2825,12 +2825,12 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(double* %out, double %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr double, double* %out, i64 4
-  %tmp0 = atomicrmw volatile xchg double* %gep, double %in seq_cst
+  %gep = getelementptr double, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile xchg ptr %gep, double %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_pointer_offset(i8** %out, i8* %in) {
+define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GCN1-LABEL: atomic_xchg_pointer_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2863,12 +2863,12 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(i8** %out, i8* %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i8*, i8** %out, i32 4
-  %val = atomicrmw volatile xchg i8** %gep, i8* %in seq_cst
+  %gep = getelementptr ptr, ptr %out, i32 4
+  %val = atomicrmw volatile xchg ptr %gep, ptr %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_xchg_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -2909,13 +2909,13 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile xchg i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2956,13 +2956,13 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile xchg i64* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -3007,14 +3007,14 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64* %out, i64* %ou
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile xchg i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_xchg_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3043,11 +3043,11 @@ define amdgpu_kernel void @atomic_xchg_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xchg i64* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_xchg_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3084,12 +3084,12 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xchg i64* %out, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3126,12 +3126,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(i64* %out, i64 %in, i64 %index
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile xchg i64* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xchg_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -3172,13 +3172,13 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile xchg i64* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_xor_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3211,12 +3211,12 @@ define amdgpu_kernel void @atomic_xor_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile xor i64* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_xor_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -3257,13 +3257,13 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile xor i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3304,13 +3304,13 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile xor i64* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -3355,14 +3355,14 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64* %out, i64* %out
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile xor i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_xor_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3391,11 +3391,11 @@ define amdgpu_kernel void @atomic_xor_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xor i64* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_xor_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3432,12 +3432,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xor i64* %out, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile xor ptr %out, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3474,12 +3474,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(i64* %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile xor i64* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_xor_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -3520,13 +3520,13 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile xor i64* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i64_offset(i64* %in, i64* %out) {
+define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) {
 ; GCN1-LABEL: atomic_load_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3561,13 +3561,13 @@ define amdgpu_kernel void @atomic_load_i64_offset(i64* %in, i64* %out) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %in, i64 4
-  %val = load atomic i64, i64* %gep  seq_cst, align 8
-  store i64 %val, i64* %out
+  %gep = getelementptr i64, ptr %in, i64 4
+  %val = load atomic i64, ptr %gep  seq_cst, align 8
+  store i64 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i64(i64* %in, i64* %out) {
+define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) {
 ; GCN1-LABEL: atomic_load_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3598,12 +3598,12 @@ define amdgpu_kernel void @atomic_load_i64(i64* %in, i64* %out) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %val = load atomic i64, i64* %in seq_cst, align 8
-  store i64 %val, i64* %out
+  %val = load atomic i64, ptr %in seq_cst, align 8
+  store i64 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64* %in, i64* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_load_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -3646,14 +3646,14 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64* %in, i64* %out, i6
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %in, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %val = load atomic i64, i64* %gep seq_cst, align 8
-  store i64 %val, i64* %out
+  %ptr = getelementptr i64, ptr %in, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %val = load atomic i64, ptr %gep seq_cst, align 8
+  store i64 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i64_addr64(i64* %in, i64* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_load_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -3692,13 +3692,13 @@ define amdgpu_kernel void @atomic_load_i64_addr64(i64* %in, i64* %out, i64 %inde
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %in, i64 %index
-  %val = load atomic i64, i64* %ptr seq_cst, align 8
-  store i64 %val, i64* %out
+  %ptr = getelementptr i64, ptr %in, i64 %index
+  %val = load atomic i64, ptr %ptr seq_cst, align 8
+  store i64 %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64* %out) {
+define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3727,12 +3727,12 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64* %out) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  store atomic i64 %in, i64* %gep  seq_cst, align 8
+  %gep = getelementptr i64, ptr %out, i64 4
+  store atomic i64 %in, ptr %gep  seq_cst, align 8
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i64(i64 %in, i64* %out) {
+define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3757,11 +3757,11 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, i64* %out) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  store atomic i64 %in, i64* %out seq_cst, align 8
+  store atomic i64 %in, ptr %out seq_cst, align 8
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_store_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3798,13 +3798,13 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64* %out, i6
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  store atomic i64 %in, i64* %gep seq_cst, align 8
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  store atomic i64 %in, ptr %gep seq_cst, align 8
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_store_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3837,12 +3837,12 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64* %out, i64 %inde
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  store atomic i64 %in, i64* %ptr seq_cst, align 8
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  store atomic i64 %in, ptr %ptr seq_cst, align 8
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3881,12 +3881,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64* %out, i64 %in, i64 %ol
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %val = cmpxchg volatile i64* %gep, i64 %old, i64 %in seq_cst seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i64_soffset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3925,12 +3925,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64* %out, i64 %in, i64 %o
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 9000
-  %val = cmpxchg volatile i64* %gep, i64 %old, i64 %in seq_cst seq_cst
+  %gep = getelementptr i64, ptr %out, i64 9000
+  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64* %out, i64* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -3973,14 +3973,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64* %out, i64* %out2,
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %val = cmpxchg volatile i64* %gep, i64 %old, i64 %in seq_cst seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
-  store i64 %extract0, i64* %out2
+  store i64 %extract0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -4023,13 +4023,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64* %out, i64 %in,
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %val = cmpxchg volatile i64* %gep, i64 %old, i64 %in seq_cst seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -4092,15 +4092,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64* %out, i64*
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %val = cmpxchg volatile i64* %gep, i64 %old, i64 %in seq_cst seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
-  store i64 %extract0, i64* %out2
+  store i64 %extract0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64(i64* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4135,11 +4135,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(i64* %out, i64 %in, i64 %old) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile i64* %out, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %out, i64 %old, i64 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64* %out, i64* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -4178,13 +4178,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64* %out, i64* %out2, i64 %in
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile i64* %out, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %out, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
-  store i64 %extract0, i64* %out2
+  store i64 %extract0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -4223,12 +4223,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64* %out, i64 %in, i64 %in
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %val = cmpxchg volatile i64* %ptr, i64 %old, i64 %in seq_cst seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) {
 ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -4287,14 +4287,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64* %out, i64* %out2,
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %val = cmpxchg volatile i64* %ptr, i64 %old, i64 %in seq_cst seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
-  store i64 %extract0, i64* %out2
+  store i64 %extract0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_f64_offset(double* %in, double* %out) {
+define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) {
 ; GCN1-LABEL: atomic_load_f64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4329,13 +4329,13 @@ define amdgpu_kernel void @atomic_load_f64_offset(double* %in, double* %out) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr double, double* %in, i64 4
-  %val = load atomic double, double* %gep  seq_cst, align 8
-  store double %val, double* %out
+  %gep = getelementptr double, ptr %in, i64 4
+  %val = load atomic double, ptr %gep  seq_cst, align 8
+  store double %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_f64(double* %in, double* %out) {
+define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) {
 ; GCN1-LABEL: atomic_load_f64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4366,12 +4366,12 @@ define amdgpu_kernel void @atomic_load_f64(double* %in, double* %out) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %val = load atomic double, double* %in seq_cst, align 8
-  store double %val, double* %out
+  %val = load atomic double, ptr %in seq_cst, align 8
+  store double %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_f64_addr64_offset(double* %in, double* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_load_f64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -4414,14 +4414,14 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(double* %in, double* %o
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr double, double* %in, i64 %index
-  %gep = getelementptr double, double* %ptr, i64 4
-  %val = load atomic double, double* %gep seq_cst, align 8
-  store double %val, double* %out
+  %ptr = getelementptr double, ptr %in, i64 %index
+  %gep = getelementptr double, ptr %ptr, i64 4
+  %val = load atomic double, ptr %gep seq_cst, align 8
+  store double %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_f64_addr64(double* %in, double* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_load_f64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -4460,13 +4460,13 @@ define amdgpu_kernel void @atomic_load_f64_addr64(double* %in, double* %out, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr double, double* %in, i64 %index
-  %val = load atomic double, double* %ptr seq_cst, align 8
-  store double %val, double* %out
+  %ptr = getelementptr double, ptr %in, i64 %index
+  %val = load atomic double, ptr %ptr seq_cst, align 8
+  store double %val, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f64_offset(double %in, double* %out) {
+define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_f64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4495,12 +4495,12 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, double* %out) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr double, double* %out, i64 4
-  store atomic double %in, double* %gep  seq_cst, align 8
+  %gep = getelementptr double, ptr %out, i64 4
+  store atomic double %in, ptr %gep  seq_cst, align 8
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f64(double %in, double* %out) {
+define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) {
 ; GCN1-LABEL: atomic_store_f64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4525,11 +4525,11 @@ define amdgpu_kernel void @atomic_store_f64(double %in, double* %out) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  store atomic double %in, double* %out seq_cst, align 8
+  store atomic double %in, ptr %out seq_cst, align 8
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, double* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_store_f64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4566,13 +4566,13 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, double* %o
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr double, double* %out, i64 %index
-  %gep = getelementptr double, double* %ptr, i64 4
-  store atomic double %in, double* %gep seq_cst, align 8
+  %ptr = getelementptr double, ptr %out, i64 %index
+  %gep = getelementptr double, ptr %ptr, i64 4
+  store atomic double %in, ptr %gep seq_cst, align 8
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f64_addr64(double %in, double* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %index) {
 ; GCN1-LABEL: atomic_store_f64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4605,7 +4605,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, double* %out, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr double, double* %out, i64 %index
-  store atomic double %in, double* %ptr seq_cst, align 8
+  %ptr = getelementptr double, ptr %out, i64 %index
+  store atomic double %in, ptr %ptr seq_cst, align 8
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll
index c823c5631183e..843800fd3881c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_min_max_system.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN1 %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN2 %s
 
-define amdgpu_kernel void @atomic_max_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_max_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -69,12 +69,12 @@ define amdgpu_kernel void @atomic_max_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile max i64* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_max_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -151,13 +151,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile max i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -232,13 +232,13 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile max i64* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -319,14 +319,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64* %out, i64* %out
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile max i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_max_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -389,11 +389,11 @@ define amdgpu_kernel void @atomic_max_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile max i64* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile max ptr %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_max_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -466,12 +466,12 @@ define amdgpu_kernel void @atomic_max_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile max i64* %out, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile max ptr %out, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -542,12 +542,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(i64* %out, i64 %in, i64 %index)
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile max i64* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -624,13 +624,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile max i64* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_umax_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -697,12 +697,12 @@ define amdgpu_kernel void @atomic_umax_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile umax i64* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_umax_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -779,13 +779,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile umax i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -860,13 +860,13 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umax i64* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -947,14 +947,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64* %out, i64* %ou
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umax i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_umax_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1017,11 +1017,11 @@ define amdgpu_kernel void @atomic_umax_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umax i64* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile umax ptr %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_umax_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1094,12 +1094,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umax i64* %out, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile umax ptr %out, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1170,12 +1170,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(i64* %out, i64 %in, i64 %index
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile umax i64* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1252,13 +1252,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile umax i64* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_min_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1325,12 +1325,12 @@ define amdgpu_kernel void @atomic_min_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile min i64* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_min_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1407,13 +1407,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile min i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1488,13 +1488,13 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile min i64* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1575,14 +1575,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64* %out, i64* %out
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile min i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_min_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1645,11 +1645,11 @@ define amdgpu_kernel void @atomic_min_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile min i64* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile min ptr %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_min_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1722,12 +1722,12 @@ define amdgpu_kernel void @atomic_min_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile min i64* %out, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile min ptr %out, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1798,12 +1798,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64(i64* %out, i64 %in, i64 %index)
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile min i64* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1880,13 +1880,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile min i64* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_offset(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_umin_i64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1953,12 +1953,12 @@ define amdgpu_kernel void @atomic_umin_i64_offset(i64* %out, i64 %in) {
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile umin i64* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_umin_i64_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2035,13 +2035,13 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64* %out, i64 4
-  %tmp0 = atomicrmw volatile umin i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i64_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -2116,13 +2116,13 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64* %out, i64 %in, i64
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umin i64* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i64_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2203,14 +2203,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64* %out, i64* %ou
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %gep = getelementptr i64, i64* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umin i64* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64(i64* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_umin_i64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2273,11 +2273,11 @@ define amdgpu_kernel void @atomic_umin_i64(i64* %out, i64 %in) {
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umin i64* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile umin ptr %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret(i64* %out, i64* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-LABEL: atomic_umin_i64_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2350,12 +2350,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret(i64* %out, i64* %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umin i64* %out, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %tmp0 = atomicrmw volatile umin ptr %out, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_addr64(i64* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i64_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -2426,12 +2426,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(i64* %out, i64 %in, i64 %index
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile umin i64* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i64_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2508,8 +2508,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64* %out, i64* %out2, i64
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64* %out, i64 %index
-  %tmp0 = atomicrmw volatile umin i64* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64* %out2
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr %out2
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll
index 3a8b4ea96a0c5..301e104b58b6a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_min_max_system.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN2 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN3 %s
 
-define amdgpu_kernel void @atomic_max_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_max_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -89,12 +89,12 @@ define amdgpu_kernel void @atomic_max_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile max i32* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile max ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_max_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -192,13 +192,13 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile max i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile max ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -296,13 +296,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile max i32* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile max ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -412,14 +412,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32* %out, i32* %out
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile max i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile max ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_max_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -501,11 +501,11 @@ define amdgpu_kernel void @atomic_max_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile max i32* %out, i32 %in seq_cst
+  %val = atomicrmw volatile max ptr %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_max_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -599,12 +599,12 @@ define amdgpu_kernel void @atomic_max_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile max i32* %out, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile max ptr %out, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -698,12 +698,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64(i32* %out, i32 %in, i64 %index)
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile max i32* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile max ptr %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_max_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -809,13 +809,13 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile max i32* %ptr, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile max ptr %ptr, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_umax_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -901,12 +901,12 @@ define amdgpu_kernel void @atomic_umax_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile umax i32* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile umax ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_umax_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1004,13 +1004,13 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile umax i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile umax ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -1108,13 +1108,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile umax i32* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile umax ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -1224,14 +1224,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32* %out, i32* %ou
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile umax i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile umax ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_umax_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -1313,11 +1313,11 @@ define amdgpu_kernel void @atomic_umax_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umax i32* %out, i32 %in seq_cst
+  %val = atomicrmw volatile umax ptr %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_umax_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1411,12 +1411,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umax i32* %out, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile umax ptr %out, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -1510,12 +1510,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(i32* %out, i32 %in, i64 %index
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile umax i32* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile umax ptr %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umax_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -1621,13 +1621,13 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile umax i32* %ptr, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile umax ptr %ptr, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_min_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -1713,12 +1713,12 @@ define amdgpu_kernel void @atomic_min_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile min i32* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile min ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_min_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1816,13 +1816,13 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile min i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile min ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -1920,13 +1920,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile min i32* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile min ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -2036,14 +2036,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32* %out, i32* %out
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile min i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile min ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_min_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -2125,11 +2125,11 @@ define amdgpu_kernel void @atomic_min_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile min i32* %out, i32 %in seq_cst
+  %val = atomicrmw volatile min ptr %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_min_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2223,12 +2223,12 @@ define amdgpu_kernel void @atomic_min_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile min i32* %out, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile min ptr %out, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -2322,12 +2322,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64(i32* %out, i32 %in, i64 %index)
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile min i32* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile min ptr %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_min_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -2433,13 +2433,13 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile min i32* %ptr, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile min ptr %ptr, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_offset(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_umin_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -2525,12 +2525,12 @@ define amdgpu_kernel void @atomic_umin_i32_offset(i32* %out, i32 %in) {
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile umin i32* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile umin ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_umin_i32_ret_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2628,13 +2628,13 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32* %out, i32 4
-  %val = atomicrmw volatile umin i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = atomicrmw volatile umin ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -2732,13 +2732,13 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32* %out, i32 %in, i64
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile umin i32* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile umin ptr %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -2848,14 +2848,14 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32* %out, i32* %ou
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %gep = getelementptr i32, i32* %ptr, i32 4
-  %val = atomicrmw volatile umin i32* %gep, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %val = atomicrmw volatile umin ptr %gep, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32(i32* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_umin_i32:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
@@ -2937,11 +2937,11 @@ define amdgpu_kernel void @atomic_umin_i32(i32* %out, i32 %in) {
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umin i32* %out, i32 %in seq_cst
+  %val = atomicrmw volatile umin ptr %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret(i32* %out, i32* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN1-LABEL: atomic_umin_i32_ret:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3035,12 +3035,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret(i32* %out, i32* %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umin i32* %out, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %val = atomicrmw volatile umin ptr %out, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_addr64(i32* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -3134,12 +3134,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(i32* %out, i32 %in, i64 %index
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile umin i32* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile umin ptr %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
 ; GCN1-LABEL: atomic_umin_i32_ret_addr64:
 ; GCN1:       ; %bb.0: ; %entry
 ; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -3245,8 +3245,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32* %out, i32* %out2, i32
 ; GCN3-NEXT:    flat_store_dword v[1:2], v0
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32* %out, i64 %index
-  %val = atomicrmw volatile umin i32* %ptr, i32 %in seq_cst
-  store i32 %val, i32* %out2
+  %ptr = getelementptr i32, ptr %out, i64 %index
+  %val = atomicrmw volatile umin ptr %ptr, i32 %in seq_cst
+  store i32 %val, ptr %out2
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index bca2989e4c483..e1ad935625458 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -6,141 +6,141 @@
 ; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32:
 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}}
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
-define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
   %a.add = fadd nnan float %a, 1.0
   %max = call float @llvm.maxnum.f32(float %a.add, float 2.0)
   %med = call float @llvm.minnum.f32(float %max, float 4.0)
 
-  store float %med, float addrspace(1)* %outgep
+  store float %med, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_f32:
 ; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
   %a.add = fadd nnan float %a, 1.0
 
   %max = call float @llvm.maxnum.f32(float %a.add, float 2.0)
   %med = call float @llvm.minnum.f32(float %max, float 4.0)
 
-  store float %med, float addrspace(1)* %outgep
+  store float %med, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_commute0_f32:
 ; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
   %a.add = fadd nnan float %a, 1.0
 
   %max = call float @llvm.maxnum.f32(float 2.0, float %a.add)
   %med = call float @llvm.minnum.f32(float 4.0, float %max)
 
-  store float %med, float addrspace(1)* %outgep
+  store float %med, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_commute1_f32:
 ; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
   %a.add = fadd nnan float %a, 1.0
 
   %max = call float @llvm.maxnum.f32(float %a.add, float 2.0)
   %med = call float @llvm.minnum.f32(float 4.0, float %max)
 
-  store float %med, float addrspace(1)* %outgep
+  store float %med, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_constant_order_f32:
 ; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
   %a.add = fadd nnan float %a, 1.0
 
   %max = call float @llvm.maxnum.f32(float %a.add, float 4.0)
   %med = call float @llvm.minnum.f32(float %max, float 2.0)
 
-  store float %med, float addrspace(1)* %outgep
+  store float %med, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_multi_use_f32:
 ; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
   %a.add = fadd nnan float %a, 1.0
 
   %max = call float @llvm.maxnum.f32(float %a.add, float 2.0)
   %med = call float @llvm.minnum.f32(float %max, float 4.0)
 
-  store volatile float %med, float addrspace(1)* %outgep
-  store volatile float %max, float addrspace(1)* %outgep
+  store volatile float %med, ptr addrspace(1) %outgep
+  store volatile float %max, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64:
 ; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0
 ; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0
-define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid
-  %a = load double, double addrspace(1)* %gep0
+  %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr double, ptr addrspace(1) %out, i32 %tid
+  %a = load double, ptr addrspace(1) %gep0
   %a.add = fadd nnan double %a, 1.0
 
   %max = call double @llvm.maxnum.f64(double %a.add, double 2.0)
   %med = call double @llvm.minnum.f64(double %max, double 4.0)
 
-  store double %med, double addrspace(1)* %outgep
+  store double %med, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32:
 ; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
-define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
 
   %max = call float @llvm.maxnum.f32(float %a, float 2.0)
   %med = call float @llvm.minnum.f32(float %max, float 4.0)
 
-  store float %med, float addrspace(1)* %outgep
+  store float %med, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_legacy_fmed3_r_i_i_f32:
 ; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
-define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
   %a.nnan = fadd nnan float %a, 1.0
 
   ; fmax_legacy
@@ -151,7 +151,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %ou
   %cmp1 = fcmp uge float %max, 4.0
   %med = select i1 %cmp1, float 4.0, float %max
 
-  store float %med, float addrspace(1)* %outgep
+  store float %med, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -160,21 +160,21 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %ou
 ; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %a.fneg = fsub float -0.0, %a
   %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -183,21 +183,21 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], -[[B]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %b.fneg = fsub float -0.0, %b
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b.fneg)
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b.fneg)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -206,21 +206,21 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrs
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], -[[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %c.fneg = fsub float -0.0, %c
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fneg)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -229,15 +229,15 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrs
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]|
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
 
   %a.fneg = fsub float -0.0, %a
   %b.fabs = call float @llvm.fabs.f32(float %b)
@@ -249,7 +249,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
 
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -258,15 +258,15 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]|
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
 
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %a.fabs.fneg = fsub float -0.0, %a.fabs
@@ -280,7 +280,7 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
 
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -292,15 +292,15 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
 ; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
 ; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
-define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
 
   %a.nnan = fadd nnan float %a, 1.0
   %b.nnan = fadd nnan float %b, 2.0
@@ -310,7 +310,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out,
   %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -320,20 +320,20 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out,
 ; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -342,20 +342,20 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(float addrspace(1)*
 ; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call nnan float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call nnan float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -364,20 +364,20 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(float addrspace(1)* %out, f
 ; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define amdgpu_kernel void @v_fast_call_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call fast float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call fast float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call fast float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call fast float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -399,20 +399,20 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(float addrspace(1)* %out, f
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -421,20 +421,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)*
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -443,20 +443,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)*
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -465,20 +465,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)*
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
   %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -487,20 +487,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)*
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
   %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
   %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -509,20 +509,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)*
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
   %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -531,20 +531,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)*
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -553,20 +553,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)*
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
   %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
   %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -575,20 +575,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)*
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -597,20 +597,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)*
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -619,20 +619,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)*
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
   %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -641,20 +641,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
   %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
   %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -663,20 +663,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
   %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
   %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
   %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -685,20 +685,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
   %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -707,20 +707,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
   %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -729,20 +729,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
   %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
   %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
   %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -754,20 +754,20 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.maxnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -780,91 +780,91 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(float addrspace(1)
 ; GCN-DAG: v_max_f32
 ; GCN: v_min_f32
 ; GCN: v_max_f32
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
-  store volatile float %tmp0, float addrspace(1)* undef
+  store volatile float %tmp0, ptr addrspace(1) undef
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1:
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
-  store volatile float %tmp1, float addrspace(1)* undef
+  store volatile float %tmp1, ptr addrspace(1) undef
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2:
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
-  store volatile float %tmp2, float addrspace(1)* undef
+  store volatile float %tmp2, ptr addrspace(1) undef
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
 
 ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0:
-define amdgpu_kernel void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0:
-define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
 
   %a.nnan = fadd float %a, 1.0
   %b.nnan = fadd nnan float %b, 2.0
@@ -874,20 +874,20 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(
   %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0:
-define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
 
   %a.nnan = fadd nnan float %a, 1.0
   %b.nnan = fadd float %b, 2.0
@@ -897,20 +897,20 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(
   %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_nnan_inputs_missing2_med3_f32_pat0:
-define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
 
   %a.nnan = fadd nnan float %a, 1.0
   %b.nnan = fadd nnan float %b, 2.0
@@ -920,7 +920,7 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(
   %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -932,21 +932,21 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(
 ; GCN-DAG: v_max_f32
 ; GCN-DAG: v_min_f32
 ; GCN-DAG: v_max_f32
-define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %a.fneg = fsub float -0.0, %a
   %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
   %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
   %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %med3, float addrspace(1)* %outgep
+  store float %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -957,18 +957,18 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(fl
 ; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[A]], [[B]]
 ; GCN: v_min_f32_e32 v{{[0-9]+}}, [[MAX]], [[C]]
-define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load volatile float, float addrspace(1)* %gep0
-  %b = load volatile float, float addrspace(1)* %gep1
-  %c = load volatile float, float addrspace(1)* %gep2
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile float, ptr addrspace(1) %gep0
+  %b = load volatile float, ptr addrspace(1) %gep1
+  %c = load volatile float, ptr addrspace(1) %gep2
   %max = call float @llvm.maxnum.f32(float %a, float %b)
   %minmax = call float @llvm.minnum.f32(float %max, float %c)
-  store float %minmax, float addrspace(1)* %outgep
+  store float %minmax, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -984,16 +984,16 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %
 
 ; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0
 ; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
-define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
-  %a = load half, half addrspace(1)* %gep0
+  %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr half, ptr addrspace(1) %out, i32 %tid
+  %a = load half, ptr addrspace(1) %gep0
   %a.add = fadd nnan half %a, 1.0
   %max = call half @llvm.maxnum.f16(half %a.add, half 2.0)
   %med = call half @llvm.minnum.f16(half %max, half 4.0)
 
-  store half %med, half addrspace(1)* %outgep
+  store half %med, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -1021,15 +1021,15 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)*
 ; VI: v_max_f16
 
 ; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
-define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr half, half addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
-  %a = load volatile half, half addrspace(1)* %gep0
-  %b = load volatile half, half addrspace(1)* %gep1
-  %c = load volatile half, half addrspace(1)* %gep2
+  %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr half, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr half, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr half, ptr addrspace(1) %out, i32 %tid
+  %a = load volatile half, ptr addrspace(1) %gep0
+  %b = load volatile half, ptr addrspace(1) %gep1
+  %c = load volatile half, ptr addrspace(1) %gep2
 
   %a.nnan = fadd nnan half %a, 1.0
   %b.nnan = fadd nnan half %b, 2.0
@@ -1039,7 +1039,7 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out,
   %tmp1 = call half @llvm.maxnum.f16(half %a.nnan, half %b.nnan)
   %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %c.nnan)
   %med3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2)
-  store half %med3, half addrspace(1)* %outgep
+  store half %med3, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -1047,16 +1047,16 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out,
 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x41000000, [[ADD]]
 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 0x41800000, [[MAX]]
-define amdgpu_kernel void @two_non_inline_constant(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
   %add = fadd nnan float %a, 0.5
   %max = call float @llvm.maxnum.f32(float %add, float 8.0)
   %med = call float @llvm.minnum.f32(float %max, float 16.0)
 
-  store float %med, float addrspace(1)* %out.gep
+  store float %med, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -1065,19 +1065,19 @@ define amdgpu_kernel void @two_non_inline_constant(float addrspace(1)* %out, flo
 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41800000
 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 1.0, [[K1]]
-define amdgpu_kernel void @one_non_inline_constant(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
   %add = fadd nnan float %a, 0.5
   %max = call float @llvm.maxnum.f32(float %add, float 1.0)
   %med = call float @llvm.minnum.f32(float %max, float 16.0)
 
-  store float %med, float addrspace(1)* %out.gep
+  store float %med, ptr addrspace(1) %out.gep
 
   %extra.use = fadd float %a, 16.0
-  store volatile float %extra.use, float addrspace(1)* undef
+  store volatile float %extra.use, ptr addrspace(1) undef
   ret void
 }
 
@@ -1086,21 +1086,21 @@ define amdgpu_kernel void @one_non_inline_constant(float addrspace(1)* %out, flo
 ; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x41800000
 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[K0]], [[VK1]]
-define amdgpu_kernel void @two_non_inline_constant_multi_use(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
   %add = fadd nnan float %a, 0.5
   %max = call float @llvm.maxnum.f32(float %add, float 8.0)
   %med = call float @llvm.minnum.f32(float %max, float 16.0)
 
-  store float %med, float addrspace(1)* %out.gep
+  store float %med, ptr addrspace(1) %out.gep
 
   %extra.use0 = fadd float %a, 16.0
-  store volatile float %extra.use0, float addrspace(1)* undef
+  store volatile float %extra.use0, ptr addrspace(1) undef
   %extra.use1 = fadd float %a, 8.0
-  store volatile float %extra.use1, float addrspace(1)* undef
+  store volatile float %extra.use1, ptr addrspace(1) undef
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index f891b326708e5..352a99619ac1e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -8,13 +8,13 @@
 ; GCN: buffer_load_dword [[REGA:v[0-9]+]]
 ; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; GCN: buffer_store_dword [[RESULT]],
-define amdgpu_kernel void @test_fmin3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
-  %a = load volatile float, float addrspace(1)* %aptr, align 4
-  %b = load volatile float, float addrspace(1)* %bptr, align 4
-  %c = load volatile float, float addrspace(1)* %cptr, align 4
+define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+  %a = load volatile float, ptr addrspace(1) %aptr, align 4
+  %b = load volatile float, ptr addrspace(1) %bptr, align 4
+  %c = load volatile float, ptr addrspace(1) %cptr, align 4
   %f0 = call float @llvm.minnum.f32(float %a, float %b)
   %f1 = call float @llvm.minnum.f32(float %f0, float %c)
-  store float %f1, float addrspace(1)* %out, align 4
+  store float %f1, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -25,13 +25,13 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(float addrspace(1)* %out, float
 ; GCN: buffer_load_dword [[REGC:v[0-9]+]]
 ; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; GCN: buffer_store_dword [[RESULT]],
-define amdgpu_kernel void @test_fmin3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
-  %a = load volatile float, float addrspace(1)* %aptr, align 4
-  %b = load volatile float, float addrspace(1)* %bptr, align 4
-  %c = load volatile float, float addrspace(1)* %cptr, align 4
+define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+  %a = load volatile float, ptr addrspace(1) %aptr, align 4
+  %b = load volatile float, ptr addrspace(1) %bptr, align 4
+  %c = load volatile float, ptr addrspace(1) %cptr, align 4
   %f0 = call float @llvm.minnum.f32(float %a, float %b)
   %f1 = call float @llvm.minnum.f32(float %c, float %f0)
-  store float %f1, float addrspace(1)* %out, align 4
+  store float %f1, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -48,13 +48,13 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(float addrspace(1)* %out, float
 
 ; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; GCN: buffer_store_short [[RESULT]],
-define amdgpu_kernel void @test_fmin3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
-  %a = load volatile half, half addrspace(1)* %aptr, align 2
-  %b = load volatile half, half addrspace(1)* %bptr, align 2
-  %c = load volatile half, half addrspace(1)* %cptr, align 2
+define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+  %a = load volatile half, ptr addrspace(1) %aptr, align 2
+  %b = load volatile half, ptr addrspace(1) %bptr, align 2
+  %c = load volatile half, ptr addrspace(1) %cptr, align 2
   %f0 = call half @llvm.minnum.f16(half %a, half %b)
   %f1 = call half @llvm.minnum.f16(half %f0, half %c)
-  store half %f1, half addrspace(1)* %out, align 2
+  store half %f1, ptr addrspace(1) %out, align 2
   ret void
 }
 
@@ -75,13 +75,13 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(half addrspace(1)* %out, half ad
 
 ; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
 ; GCN: buffer_store_short [[RESULT]],
-define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
-  %a = load volatile half, half addrspace(1)* %aptr, align 2
-  %b = load volatile half, half addrspace(1)* %bptr, align 2
-  %c = load volatile half, half addrspace(1)* %cptr, align 2
+define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+  %a = load volatile half, ptr addrspace(1) %aptr, align 2
+  %b = load volatile half, ptr addrspace(1) %bptr, align 2
+  %c = load volatile half, ptr addrspace(1) %cptr, align 2
   %f0 = call half @llvm.minnum.f16(half %a, half %b)
   %f1 = call half @llvm.minnum.f16(half %c, half %f0)
-  store half %f1, half addrspace(1)* %out, align 2
+  store half %f1, ptr addrspace(1) %out, align 2
   ret void
 }
 
@@ -120,26 +120,26 @@ entry:
 
 ; GCN-LABEL: {{^}}test_fmin3_olt_0_f64:
 ; GCN-NOT: v_min3
-define amdgpu_kernel void @test_fmin3_olt_0_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) #0 {
-  %a = load volatile double, double addrspace(1)* %aptr, align 4
-  %b = load volatile double, double addrspace(1)* %bptr, align 4
-  %c = load volatile double, double addrspace(1)* %cptr, align 4
+define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+  %a = load volatile double, ptr addrspace(1) %aptr, align 4
+  %b = load volatile double, ptr addrspace(1) %bptr, align 4
+  %c = load volatile double, ptr addrspace(1) %cptr, align 4
   %f0 = call double @llvm.minnum.f64(double %a, double %b)
   %f1 = call double @llvm.minnum.f64(double %f0, double %c)
-  store double %f1, double addrspace(1)* %out, align 4
+  store double %f1, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; Commute operand of second fmin
 ; GCN-LABEL: {{^}}test_fmin3_olt_1_f64:
 ; GCN-NOT: v_min3
-define amdgpu_kernel void @test_fmin3_olt_1_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) #0 {
-  %a = load volatile double, double addrspace(1)* %aptr, align 4
-  %b = load volatile double, double addrspace(1)* %bptr, align 4
-  %c = load volatile double, double addrspace(1)* %cptr, align 4
+define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
+  %a = load volatile double, ptr addrspace(1) %aptr, align 4
+  %b = load volatile double, ptr addrspace(1) %bptr, align 4
+  %c = load volatile double, ptr addrspace(1) %cptr, align 4
   %f0 = call double @llvm.minnum.f64(double %a, double %b)
   %f1 = call double @llvm.minnum.f64(double %c, double %f0)
-  store double %f1, double addrspace(1)* %out, align 4
+  store double %f1, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
index 9aff45801e5b3..d20c39d510364 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -check-prefix=VI %s
 
-define amdgpu_kernel void @test_fmin_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: test_fmin_legacy_uge_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -42,19 +42,19 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(double addrspace(1)* %out, d
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load double, double addrspace(1)* %gep.0, align 8
-  %b = load double, double addrspace(1)* %gep.1, align 8
+  %a = load double, ptr addrspace(1) %gep.0, align 8
+  %b = load double, ptr addrspace(1) %gep.1, align 8
 
   %cmp = fcmp uge double %a, %b
   %val = select i1 %cmp, double %b, double %a
-  store double %val, double addrspace(1)* %out, align 8
+  store double %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @test_fmin_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: test_fmin_legacy_ugt_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -94,19 +94,19 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(double addrspace(1)* %out, d
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load double, double addrspace(1)* %gep.0, align 8
-  %b = load double, double addrspace(1)* %gep.1, align 8
+  %a = load double, ptr addrspace(1) %gep.0, align 8
+  %b = load double, ptr addrspace(1) %gep.1, align 8
 
   %cmp = fcmp ugt double %a, %b
   %val = select i1 %cmp, double %b, double %a
-  store double %val, double addrspace(1)* %out, align 8
+  store double %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: test_fmin_legacy_ule_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -146,19 +146,19 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, d
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load double, double addrspace(1)* %gep.0, align 8
-  %b = load double, double addrspace(1)* %gep.1, align 8
+  %a = load double, ptr addrspace(1) %gep.0, align 8
+  %b = load double, ptr addrspace(1) %gep.1, align 8
 
   %cmp = fcmp ule double %a, %b
   %val = select i1 %cmp, double %a, double %b
-  store double %val, double addrspace(1)* %out, align 8
+  store double %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: test_fmin_legacy_ult_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -198,19 +198,19 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, d
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load double, double addrspace(1)* %gep.0, align 8
-  %b = load double, double addrspace(1)* %gep.1, align 8
+  %a = load double, ptr addrspace(1) %gep.0, align 8
+  %b = load double, ptr addrspace(1) %gep.1, align 8
 
   %cmp = fcmp ult double %a, %b
   %val = select i1 %cmp, double %a, double %b
-  store double %val, double addrspace(1)* %out, align 8
+  store double %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @test_fmin_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: test_fmin_legacy_oge_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -250,19 +250,19 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(double addrspace(1)* %out, d
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load double, double addrspace(1)* %gep.0, align 8
-  %b = load double, double addrspace(1)* %gep.1, align 8
+  %a = load double, ptr addrspace(1) %gep.0, align 8
+  %b = load double, ptr addrspace(1) %gep.1, align 8
 
   %cmp = fcmp oge double %a, %b
   %val = select i1 %cmp, double %b, double %a
-  store double %val, double addrspace(1)* %out, align 8
+  store double %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @test_fmin_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: test_fmin_legacy_ogt_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -302,19 +302,19 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(double addrspace(1)* %out, d
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load double, double addrspace(1)* %gep.0, align 8
-  %b = load double, double addrspace(1)* %gep.1, align 8
+  %a = load double, ptr addrspace(1) %gep.0, align 8
+  %b = load double, ptr addrspace(1) %gep.1, align 8
 
   %cmp = fcmp ogt double %a, %b
   %val = select i1 %cmp, double %b, double %a
-  store double %val, double addrspace(1)* %out, align 8
+  store double %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: test_fmin_legacy_ole_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -354,19 +354,19 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, d
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load double, double addrspace(1)* %gep.0, align 8
-  %b = load double, double addrspace(1)* %gep.1, align 8
+  %a = load double, ptr addrspace(1) %gep.0, align 8
+  %b = load double, ptr addrspace(1) %gep.1, align 8
 
   %cmp = fcmp ole double %a, %b
   %val = select i1 %cmp, double %a, double %b
-  store double %val, double addrspace(1)* %out, align 8
+  store double %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_olt_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: test_fmin_legacy_olt_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -406,15 +406,15 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, d
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load double, double addrspace(1)* %gep.0, align 8
-  %b = load double, double addrspace(1)* %gep.1, align 8
+  %a = load double, ptr addrspace(1) %gep.0, align 8
+  %b = load double, ptr addrspace(1) %gep.1, align 8
 
   %cmp = fcmp olt double %a, %b
   %val = select i1 %cmp, double %a, double %b
-  store double %val, double addrspace(1)* %out, align 8
+  store double %val, ptr addrspace(1) %out, align 8
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
index 51ee7f8a8a5b1..ac1bfdb4dbf0c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -21,12 +21,12 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 ; VI-SAFE: v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 
 ; VI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace(1)* %out, <4 x float> %reg0) #0 {
+define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(ptr addrspace(1) %out, <4 x float> %reg0) #0 {
    %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = extractelement <4 x float> %reg0, i32 1
    %r2 = fcmp uge float %r0, %r1
    %r3 = select i1 %r2, float %r1, float %r0
-   store float %r3, float addrspace(1)* %out
+   store float %r3, ptr addrspace(1) %out
    ret void
 }
 
@@ -46,10 +46,10 @@ define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace(
 ; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]]
 
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[#LOAD + 2]], [[VB]]
-define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
   %cmp = fcmp ule float %a, %b
   %val = select i1 %cmp, float %a, float %b
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -67,12 +67,12 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out,
 ; VI-SAFE: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], vcc
 
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]
-define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(ptr addrspace(1) %out, float %a, float %b) #0 {
   %a.nnan = fadd nnan float %a, 1.0
   %b.nnan = fadd nnan float %b, 2.0
   %cmp = fcmp ule float %a.nnan, %b.nnan
   %val = select i1 %cmp, float %a.nnan, float %b.nnan
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -86,17 +86,17 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(float addrspace(1
 ; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
 
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ule_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load volatile float, float addrspace(1)* %gep.0, align 4
-  %b = load volatile float, float addrspace(1)* %gep.1, align 4
+  %a = load volatile float, ptr addrspace(1) %gep.0, align 4
+  %b = load volatile float, ptr addrspace(1) %gep.1, align 4
 
   %cmp = fcmp ule float %a, %b
   %val = select i1 %cmp, float %a, float %b
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -110,17 +110,17 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, fl
 ; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
 
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load volatile float, float addrspace(1)* %gep.0, align 4
-  %b = load volatile float, float addrspace(1)* %gep.1, align 4
+  %a = load volatile float, ptr addrspace(1) %gep.0, align 4
+  %b = load volatile float, ptr addrspace(1) %gep.1, align 4
 
   %cmp = fcmp ole float %a, %b
   %val = select i1 %cmp, float %a, float %b
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -134,17 +134,17 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, fl
 ; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
 
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_olt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load volatile float, float addrspace(1)* %gep.0, align 4
-  %b = load volatile float, float addrspace(1)* %gep.1, align 4
+  %a = load volatile float, ptr addrspace(1) %gep.0, align 4
+  %b = load volatile float, ptr addrspace(1) %gep.1, align 4
 
   %cmp = fcmp olt float %a, %b
   %val = select i1 %cmp, float %a, float %b
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -158,17 +158,17 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, fl
 ; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
 
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load volatile float, float addrspace(1)* %gep.0, align 4
-  %b = load volatile float, float addrspace(1)* %gep.1, align 4
+  %a = load volatile float, ptr addrspace(1) %gep.0, align 4
+  %b = load volatile float, ptr addrspace(1) %gep.1, align 4
 
   %cmp = fcmp ult float %a, %b
   %val = select i1 %cmp, float %a, float %b
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -182,17 +182,17 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, fl
 ; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
 
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr <1 x float>, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr <1 x float>, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load volatile <1 x float>, <1 x float> addrspace(1)* %gep.0
-  %b = load volatile <1 x float>, <1 x float> addrspace(1)* %gep.1
+  %a = load volatile <1 x float>, ptr addrspace(1) %gep.0
+  %b = load volatile <1 x float>, ptr addrspace(1) %gep.1
 
   %cmp = fcmp ult <1 x float> %a, %b
   %val = select <1 x i1> %cmp, <1 x float> %a, <1 x float> %b
-  store <1 x float> %val, <1 x float> addrspace(1)* %out
+  store <1 x float> %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -209,17 +209,17 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)*
 
 ; GCN-NONAN: v_min_f32_e32
 ; GCN-NONAN: v_min_f32_e32
-define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr <2 x float>, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load volatile <2 x float>, <2 x float> addrspace(1)* %gep.0
-  %b = load volatile <2 x float>, <2 x float> addrspace(1)* %gep.1
+  %a = load volatile <2 x float>, ptr addrspace(1) %gep.0
+  %b = load volatile <2 x float>, ptr addrspace(1) %gep.1
 
   %cmp = fcmp ult <2 x float> %a, %b
   %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
-  store <2 x float> %val, <2 x float> addrspace(1)* %out
+  store <2 x float> %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -242,17 +242,17 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)*
 ; GCN-NONAN: v_min_f32_e32
 ; GCN-NONAN: v_min_f32_e32
 ; GCN-NONAN-NOT: v_min_
-define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr <3 x float>, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr <3 x float>, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load <3 x float>, <3 x float> addrspace(1)* %gep.0
-  %b = load <3 x float>, <3 x float> addrspace(1)* %gep.1
+  %a = load <3 x float>, ptr addrspace(1) %gep.0
+  %b = load <3 x float>, ptr addrspace(1) %gep.1
 
   %cmp = fcmp ult <3 x float> %a, %b
   %val = select <3 x i1> %cmp, <3 x float> %a, <3 x float> %b
-  store <3 x float> %val, <3 x float> addrspace(1)* %out
+  store <3 x float> %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -264,18 +264,18 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)*
 ; GCN-NEXT: v_cndmask_b32
 ; GCN-NOT: v_min
 ; GCN: s_endpgm
-define amdgpu_kernel void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f32_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load volatile float, float addrspace(1)* %gep.0, align 4
-  %b = load volatile float, float addrspace(1)* %gep.1, align 4
+  %a = load volatile float, ptr addrspace(1) %gep.0, align 4
+  %b = load volatile float, ptr addrspace(1) %gep.1, align 4
 
   %cmp = fcmp ole float %a, %b
   %val0 = select i1 %cmp, float %a, float %b
-  store float %val0, float addrspace(1)* %out0, align 4
-  store i1 %cmp, i1 addrspace(1)* %out1
+  store float %val0, ptr addrspace(1) %out0, align 4
+  store i1 %cmp, ptr addrspace(1) %out1
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll b/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll
index 6f884529574e5..89a0529f08623 100644
--- a/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll
@@ -18,7 +18,7 @@ declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0
 ; GCN: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETB]], [[QUIETA]]
 define amdgpu_kernel void @test_fmin_f64_ieee_noflush([8 x i32], double %a, [8 x i32], double %b) #1 {
   %val = call double @llvm.minnum.f64(double %a, double %b) #0
-  store double %val, double addrspace(1)* undef, align 8
+  store double %val, ptr addrspace(1) undef, align 8
   ret void
 }
 
@@ -34,7 +34,7 @@ define amdgpu_kernel void @test_fmin_f64_ieee_noflush([8 x i32], double %a, [8 x
 ; GCN: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETB]], [[QUIETA]]
 define amdgpu_kernel void @test_fmin_f64_ieee_flush([8 x i32], double %a, [8 x i32], double %b) #2 {
   %val = call double @llvm.minnum.f64(double %a, double %b) #0
-  store double %val, double addrspace(1)* undef, align 8
+  store double %val, ptr addrspace(1) undef, align 8
   ret void
 }
 
@@ -47,19 +47,19 @@ define amdgpu_kernel void @test_fmin_f64_ieee_flush([8 x i32], double %a, [8 x i
 ; GCN-NOT: [[RESULT]]
 ; GCN: ds_write_b64 v{{[0-9]+}}, [[RESULT]]
 define amdgpu_ps void @test_fmin_f64_no_ieee() nounwind {
-  %a = load volatile double, double addrspace(3)* undef
-  %b = load volatile double, double addrspace(3)* undef
+  %a = load volatile double, ptr addrspace(3) undef
+  %b = load volatile double, ptr addrspace(3) undef
   %val = call double @llvm.minnum.f64(double %a, double %b) #0
-  store volatile double %val, double addrspace(3)* undef
+  store volatile double %val, ptr addrspace(3) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_fmin_v2f64:
 ; GCN: v_min_f64
 ; GCN: v_min_f64
-define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v2f64(ptr addrspace(1) %out, <2 x double> %a, <2 x double> %b) nounwind {
   %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0
-  store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
+  store <2 x double> %val, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -68,9 +68,9 @@ define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x
 ; GCN: v_min_f64
 ; GCN: v_min_f64
 ; GCN: v_min_f64
-define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v4f64(ptr addrspace(1) %out, <4 x double> %a, <4 x double> %b) nounwind {
   %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0
-  store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
+  store <4 x double> %val, ptr addrspace(1) %out, align 32
   ret void
 }
 
@@ -83,9 +83,9 @@ define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x
 ; GCN: v_min_f64
 ; GCN: v_min_f64
 ; GCN: v_min_f64
-define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v8f64(ptr addrspace(1) %out, <8 x double> %a, <8 x double> %b) nounwind {
   %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0
-  store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
+  store <8 x double> %val, ptr addrspace(1) %out, align 64
   ret void
 }
 
@@ -106,9 +106,9 @@ define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x
 ; GCN: v_min_f64
 ; GCN: v_min_f64
 ; GCN: v_min_f64
-define amdgpu_kernel void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v16f64(ptr addrspace(1) %out, <16 x double> %a, <16 x double> %b) nounwind {
   %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0
-  store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
+  store <16 x double> %val, ptr addrspace(1) %out, align 128
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fminnum.ll b/llvm/test/CodeGen/AMDGPU/fminnum.ll
index 84ed73a864376..69424f31d1b9f 100644
--- a/llvm/test/CodeGen/AMDGPU/fminnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminnum.ll
@@ -7,9 +7,9 @@
 ; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_fmin_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_fmin_f32_ieee_mode_on(ptr addrspace(1) %out, float %a, float %b) #0 {
   %val = call float @llvm.minnum.f32(float %a, float %b) #1
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -43,9 +43,9 @@ define amdgpu_ps float @test_fmin_f32_ieee_mode_off(float %a, float %b) #0 {
 ; GCN-LABEL: {{^}}test_fmin_v2f32:
 ; GCN: v_min_f32_e32
 ; GCN: v_min_f32_e32
-define amdgpu_kernel void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @test_fmin_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 {
   %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b)
-  store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
+  store <2 x float> %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -54,9 +54,9 @@ define amdgpu_kernel void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x
 ; GCN: v_min_f32_e32
 ; GCN: v_min_f32_e32
 ; GCN: v_min_f32_e32
-define amdgpu_kernel void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) #0 {
+define amdgpu_kernel void @test_fmin_v4f32(ptr addrspace(1) %out, <4 x float> %a, <4 x float> %b) #0 {
   %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b)
-  store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
+  store <4 x float> %val, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -69,9 +69,9 @@ define amdgpu_kernel void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x
 ; GCN: v_min_f32_e32
 ; GCN: v_min_f32_e32
 ; GCN: v_min_f32_e32
-define amdgpu_kernel void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 {
+define amdgpu_kernel void @test_fmin_v8f32(ptr addrspace(1) %out, <8 x float> %a, <8 x float> %b) #0 {
   %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b)
-  store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
+  store <8 x float> %val, ptr addrspace(1) %out, align 32
   ret void
 }
 
@@ -92,9 +92,9 @@ define amdgpu_kernel void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x
 ; GCN: v_min_f32_e32
 ; GCN: v_min_f32_e32
 ; GCN: v_min_f32_e32
-define amdgpu_kernel void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) #0 {
+define amdgpu_kernel void @test_fmin_v16f32(ptr addrspace(1) %out, <16 x float> %a, <16 x float> %b) #0 {
   %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b)
-  store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
+  store <16 x float> %val, ptr addrspace(1) %out, align 64
   ret void
 }
 
@@ -102,9 +102,9 @@ define amdgpu_kernel void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16
 ; GCN-NOT: v_min_f32_e32
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; GCN: buffer_store_dword [[REG]]
-define amdgpu_kernel void @constant_fold_fmin_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @constant_fold_fmin_f32(ptr addrspace(1) %out) #0 {
   %val = call float @llvm.minnum.f32(float 1.0, float 2.0)
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -112,9 +112,9 @@ define amdgpu_kernel void @constant_fold_fmin_f32(float addrspace(1)* %out) #0 {
 ; GCN-NOT: v_min_f32_e32
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
 ; GCN: buffer_store_dword [[REG]]
-define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(ptr addrspace(1) %out) #0 {
   %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000)
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -122,9 +122,9 @@ define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %o
 ; GCN-NOT: v_min_f32_e32
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; GCN: buffer_store_dword [[REG]]
-define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(ptr addrspace(1) %out) #0 {
   %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000)
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -132,9 +132,9 @@ define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %o
 ; GCN-NOT: v_min_f32_e32
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
 ; GCN: buffer_store_dword [[REG]]
-define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(ptr addrspace(1) %out) #0 {
   %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0)
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -142,9 +142,9 @@ define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %o
 ; GCN-NOT: v_min_f32_e32
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; GCN: buffer_store_dword [[REG]]
-define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(ptr addrspace(1) %out) #0 {
   %val = call float @llvm.minnum.f32(float 0.0, float 0.0)
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -152,9 +152,9 @@ define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out
 ; GCN-NOT: v_min_f32_e32
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; GCN: buffer_store_dword [[REG]]
-define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(ptr addrspace(1) %out) #0 {
   %val = call float @llvm.minnum.f32(float 0.0, float -0.0)
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -162,9 +162,9 @@ define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out
 ; GCN-NOT: v_min_f32_e32
 ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(ptr addrspace(1) %out) #0 {
   %val = call float @llvm.minnum.f32(float -0.0, float 0.0)
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -172,9 +172,9 @@ define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out
 ; GCN-NOT: v_min_f32_e32
 ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(ptr addrspace(1) %out) #0 {
   %val = call float @llvm.minnum.f32(float -0.0, float -0.0)
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs-divergence-driven-isel.ll
index 8fa91f2c024e8..5ea6ac77149d0 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs-divergence-driven-isel.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GCN,FP16 %s
 
 
-define amdgpu_kernel void @divergent_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @divergent_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: name:            divergent_fneg_f32
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
@@ -10,29 +10,29 @@ define amdgpu_kernel void @divergent_fneg_f32(float addrspace(1)* %out, float ad
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
-  %val = load volatile float, float addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds float, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %val = load volatile float, ptr addrspace(1) %in.gep
   %fneg = fneg float %val
-  store float %fneg, float addrspace(1)* %out.gep
+  store float %fneg, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @uniform_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in, i64 %idx) {
+define amdgpu_kernel void @uniform_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %idx) {
 ; GCN-LABEL: name:            uniform_fneg_f32
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
 ; GCN: S_XOR_B32 killed %{{[0-9]+}}, killed %[[REG]]
 
-  %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %idx
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %idx
-  %val = load volatile float, float addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds float, ptr addrspace(1) %in, i64 %idx
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idx
+  %val = load volatile float, ptr addrspace(1) %in.gep
   %fneg = fneg float %val
-  store float %fneg, float addrspace(1)* %out.gep
+  store float %fneg, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @divergent_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @divergent_fabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: name:            divergent_fabs_f32
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
@@ -40,29 +40,29 @@ define amdgpu_kernel void @divergent_fabs_f32(float addrspace(1)* %out, float ad
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
-  %val = load volatile float, float addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds float, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %val = load volatile float, ptr addrspace(1) %in.gep
   %fabs = call float @llvm.fabs.f32(float %val)
-  store float %fabs, float addrspace(1)* %out.gep
+  store float %fabs, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @uniform_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in, i64 %idx) {
+define amdgpu_kernel void @uniform_fabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %idx) {
 ; GCN-LABEL: name:            uniform_fabs_f32
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
 ; GCN: S_AND_B32 killed %{{[0-9]+}}, killed %[[REG]]
 
-  %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %idx
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %idx
-  %val = load volatile float, float addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds float, ptr addrspace(1) %in, i64 %idx
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idx
+  %val = load volatile float, ptr addrspace(1) %in.gep
   %fabs = call float @llvm.fabs.f32(float %val)
-  store float %fabs, float addrspace(1)* %out.gep
+  store float %fabs, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @divergent_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @divergent_fneg_fabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: name:            divergent_fneg_fabs_f32
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
@@ -70,32 +70,32 @@ define amdgpu_kernel void @divergent_fneg_fabs_f32(float addrspace(1)* %out, flo
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
-  %val = load volatile float, float addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds float, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %val = load volatile float, ptr addrspace(1) %in.gep
   %fabs = call float @llvm.fabs.f32(float %val)
   %fneg = fneg float %fabs
-  store float %fneg, float addrspace(1)* %out.gep
+  store float %fneg, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @uniform_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in, i64 %idx) {
+define amdgpu_kernel void @uniform_fneg_fabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %idx) {
 ; GCN-LABEL: name:            uniform_fneg_fabs_f32
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
 ; GCN: S_OR_B32 killed %{{[0-9]+}}, killed %[[REG]]
 
-  %in.gep = getelementptr inbounds float, float addrspace(1)* %in, i64 %idx
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %idx
-  %val = load volatile float, float addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds float, ptr addrspace(1) %in, i64 %idx
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idx
+  %val = load volatile float, ptr addrspace(1) %in.gep
   %fabs = call float @llvm.fabs.f32(float %val)
   %fneg = fneg float %fabs
-  store float %fneg, float addrspace(1)* %out.gep
+  store float %fneg, ptr addrspace(1) %out.gep
   ret void
 }
 
 
-define amdgpu_kernel void @divergent_fabs_f16(half addrspace(1)* %in, half addrspace(1)* %out) {
+define amdgpu_kernel void @divergent_fabs_f16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: name:            divergent_fabs_f16
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32767
@@ -103,27 +103,27 @@ define amdgpu_kernel void @divergent_fabs_f16(half addrspace(1)* %in, half addrs
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %tid.ext
-  %val = load volatile half, half addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds half, ptr addrspace(1) %in, i64 %tid.ext
+  %val = load volatile half, ptr addrspace(1) %in.gep
   %fabs = call half @llvm.fabs.f16(half %val)
-  store half %fabs, half addrspace(1)* %out
+  store half %fabs, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @uniform_fabs_f16(half addrspace(1)* %in, half addrspace(1)* %out, i64 %idx) {
+define amdgpu_kernel void @uniform_fabs_f16(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %idx) {
 ; GCN-LABEL: name:            uniform_fabs_f16
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32767
 ; GCN: S_AND_B32 killed %{{[0-9]+}}, killed %[[REG]]
 
-  %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %idx
-  %val = load volatile half, half addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds half, ptr addrspace(1) %in, i64 %idx
+  %val = load volatile half, ptr addrspace(1) %in.gep
   %fabs = call half @llvm.fabs.f16(half %val)
-  store half %fabs, half addrspace(1)* %out
+  store half %fabs, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @divergent_fneg_f16(half addrspace(1)* %in, half addrspace(1)* %out) {
+define amdgpu_kernel void @divergent_fneg_f16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: name:            divergent_fneg_f16
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32768
@@ -131,27 +131,27 @@ define amdgpu_kernel void @divergent_fneg_f16(half addrspace(1)* %in, half addrs
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %tid.ext
-  %val = load volatile half, half addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds half, ptr addrspace(1) %in, i64 %tid.ext
+  %val = load volatile half, ptr addrspace(1) %in.gep
   %fneg = fneg half %val
-  store half %fneg, half addrspace(1)* %out
+  store half %fneg, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @uniform_fneg_f16(half addrspace(1)* %in, half addrspace(1)* %out, i64 %idx) {
+define amdgpu_kernel void @uniform_fneg_f16(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %idx) {
 ; GCN-LABEL: name:            uniform_fneg_f16
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32768
 ; GCN: S_XOR_B32 killed %{{[0-9]+}}, killed %[[REG]]
 
-  %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %idx
-  %val = load volatile half, half addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds half, ptr addrspace(1) %in, i64 %idx
+  %val = load volatile half, ptr addrspace(1) %in.gep
   %fneg = fneg half %val
-  store half %fneg, half addrspace(1)* %out
+  store half %fneg, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @divergent_fneg_fabs_f16(half addrspace(1)* %in, half addrspace(1)* %out) {
+define amdgpu_kernel void @divergent_fneg_fabs_f16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: name:            divergent_fneg_fabs_f16
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32768
@@ -159,118 +159,118 @@ define amdgpu_kernel void @divergent_fneg_fabs_f16(half addrspace(1)* %in, half
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %tid.ext
-  %val = load volatile half, half addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds half, ptr addrspace(1) %in, i64 %tid.ext
+  %val = load volatile half, ptr addrspace(1) %in.gep
   %fabs = call half @llvm.fabs.f16(half %val)
   %fneg = fneg half %fabs
-  store half %fneg, half addrspace(1)* %out
+  store half %fneg, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @uniform_fneg_fabs_f16(half addrspace(1)* %in, half addrspace(1)* %out, i64 %idx) {
+define amdgpu_kernel void @uniform_fneg_fabs_f16(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %idx) {
 ; GCN-LABEL: name:            uniform_fneg_fabs_f16
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 32768
 ; GCN: S_OR_B32 killed %{{[0-9]+}}, killed %[[REG]]
 
-  %in.gep = getelementptr inbounds half, half addrspace(1)* %in, i64 %idx
-  %val = load volatile half, half addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds half, ptr addrspace(1) %in, i64 %idx
+  %val = load volatile half, ptr addrspace(1) %in.gep
   %fabs = call half @llvm.fabs.f16(half %val)
   %fneg = fneg half %fabs
-  store half %fneg, half addrspace(1)* %out
+  store half %fneg, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @divergent_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) {
+define amdgpu_kernel void @divergent_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: name:            divergent_fneg_v2f16
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147450880
 ; FP16: V_XOR_B32_e64 killed %[[REG]]
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
-  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
+  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
+  %val = load <2 x half>, ptr addrspace(1) %gep.in, align 2
   %fneg = fneg <2 x half> %val
-  store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out
+  store <2 x half> %fneg, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @uniform_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @uniform_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
 ; GCN-LABEL: name:            uniform_fneg_v2f16
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147450880
 ; GCN: S_XOR_B32 killed %{{[0-9]+}}, killed %[[REG]]
 
-  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx
-  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx
-  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
+  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %idx
+  %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %idx
+  %val = load <2 x half>, ptr addrspace(1) %gep.in, align 2
   %fneg = fneg <2 x half> %val
-  store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out
+  store <2 x half> %fneg, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @divergent_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) {
+define amdgpu_kernel void @divergent_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: name:            divergent_fabs_v2f16
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147450879
 ; FP16: V_AND_B32_e64 killed %[[REG]]
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
-  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
+  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
+  %val = load <2 x half>, ptr addrspace(1) %gep.in, align 2
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
-  store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out
+  store <2 x half> %fabs, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @uniform_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @uniform_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
 ; GCN-LABEL: name:            uniform_fabs_v2f16
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147450879
 ; GCN: S_AND_B32 killed %{{[0-9]+}}, killed %[[REG]]
 
-  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx
-  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx
-  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
+  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %idx
+  %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %idx
+  %val = load <2 x half>, ptr addrspace(1) %gep.in, align 2
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
-  store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out
+  store <2 x half> %fabs, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @divergent_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) {
+define amdgpu_kernel void @divergent_fneg_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: name:            divergent_fneg_fabs_v2f16
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; FP16: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147450880
 ; FP16: V_OR_B32_e64 killed %[[REG]]
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
-  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
+  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
+  %val = load <2 x half>, ptr addrspace(1) %gep.in, align 2
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
   %fneg = fneg <2 x half> %fabs
-  store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out
+  store <2 x half> %fneg, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @uniform_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @uniform_fneg_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
 ; GCN-LABEL: name:            uniform_fneg_fabs_v2f16
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147450880
 ; GCN: S_OR_B32 killed %{{[0-9]+}}, killed %[[REG]]
 
-  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx
-  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %idx
-  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
+  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %idx
+  %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %idx
+  %val = load <2 x half>, ptr addrspace(1) %gep.in, align 2
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
   %fneg = fneg <2 x half> %fabs
-  store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out
+  store <2 x half> %fneg, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @divergent_fneg_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @divergent_fneg_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: name:            divergent_fneg_v2f32
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
@@ -278,30 +278,30 @@ define amdgpu_kernel void @divergent_fneg_v2f32(<2 x float> addrspace(1)* %out,
 ; GCN: V_XOR_B32_e64 %[[REG]]
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
-  %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i32 %tid
+  %val = load <2 x float>, ptr addrspace(1) %gep.in, align 4
   %fneg = fneg <2 x float> %val
-  store <2 x float> %fneg, <2 x float> addrspace(1)* %gep.out
+  store <2 x float> %fneg, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @uniform_fneg_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @uniform_fneg_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
 ; GCN-LABEL: name:            uniform_fneg_v2f32
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
 ; GCN: S_XOR_B32 killed %{{[0-9]+}}, %[[REG]]
 ; GCN: S_XOR_B32 killed %{{[0-9]+}}, %[[REG]]
 
-  %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx
-  %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx
-  %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i32 %idx
+  %gep.out = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i32 %idx
+  %val = load <2 x float>, ptr addrspace(1) %gep.in, align 4
   %fneg = fneg <2 x float> %val
-  store <2 x float> %fneg, <2 x float> addrspace(1)* %gep.out
+  store <2 x float> %fneg, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @divergent_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @divergent_fabs_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: name:            divergent_fabs_v2f32
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
@@ -309,30 +309,30 @@ define amdgpu_kernel void @divergent_fabs_v2f32(<2 x float> addrspace(1)* %out,
 ; GCN: V_AND_B32_e64 %[[REG]]
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
-  %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i32 %tid
+  %val = load <2 x float>, ptr addrspace(1) %gep.in, align 4
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %val)
-  store <2 x float> %fabs, <2 x float> addrspace(1)* %gep.out
+  store <2 x float> %fabs, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @uniform_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @uniform_fabs_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
 ; GCN-LABEL: name:            uniform_fabs_v2f32
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
 ; GCN: S_AND_B32 killed %{{[0-9]+}}, %[[REG]]
 ; GCN: S_AND_B32 killed %{{[0-9]+}}, %[[REG]]
 
-  %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx
-  %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx
-  %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i32 %idx
+  %gep.out = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i32 %idx
+  %val = load <2 x float>, ptr addrspace(1) %gep.in, align 4
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %val)
-  store <2 x float> %fabs, <2 x float> addrspace(1)* %gep.out
+  store <2 x float> %fabs, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @divergent_fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @divergent_fneg_fabs_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: name:            divergent_fneg_fabs_v2f32
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
@@ -340,32 +340,32 @@ define amdgpu_kernel void @divergent_fneg_fabs_v2f32(<2 x float> addrspace(1)* %
 ; GCN: V_OR_B32_e64 %[[REG]]
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
-  %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i32 %tid
+  %val = load <2 x float>, ptr addrspace(1) %gep.in, align 4
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %val)
   %fneg = fneg <2 x float> %fabs
-  store <2 x float> %fneg, <2 x float> addrspace(1)* %gep.out
+  store <2 x float> %fneg, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @uniform_fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @uniform_fneg_fabs_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
 ; GCN-LABEL: name:            uniform_fneg_fabs_v2f32
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; GCN: %[[REG:[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
 ; GCN: S_OR_B32 killed %{{[0-9]+}}, %[[REG]]
 ; GCN: S_OR_B32 killed %{{[0-9]+}}, %[[REG]]
 
-  %gep.in = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx
-  %gep.out = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in, i32 %idx
-  %val = load <2 x float>, <2 x float> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i32 %idx
+  %gep.out = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i32 %idx
+  %val = load <2 x float>, ptr addrspace(1) %gep.in, align 4
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %val)
   %fneg = fneg <2 x float> %fabs
-  store <2 x float> %fneg, <2 x float> addrspace(1)* %gep.out
+  store <2 x float> %fneg, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @divergent_fneg_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @divergent_fneg_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: name:            divergent_fneg_f64
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64
@@ -379,15 +379,15 @@ define amdgpu_kernel void @divergent_fneg_f64(double addrspace(1)* %out, double
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
-  %val = load volatile double, double addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds double, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
+  %val = load volatile double, ptr addrspace(1) %in.gep
   %fneg = fneg double %val
-  store double %fneg, double addrspace(1)* %out.gep
+  store double %fneg, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @uniform_fneg_f64(double addrspace(1)* %out, double addrspace(1)* %in, i64 %idx) {
+define amdgpu_kernel void @uniform_fneg_f64(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %idx) {
 ; GCN-LABEL: name:            uniform_fneg_f64
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64
@@ -399,15 +399,15 @@ define amdgpu_kernel void @uniform_fneg_f64(double addrspace(1)* %out, double ad
 ; GCN: %[[XOR_COPY:[0-9]+]]:sreg_32 = COPY %[[XOR]]
 ; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[XOR_COPY]], %subreg.sub1
 
-  %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %idx
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %idx
-  %val = load volatile double, double addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds double, ptr addrspace(1) %in, i64 %idx
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %idx
+  %val = load volatile double, ptr addrspace(1) %in.gep
   %fneg = fneg double %val
-  store double %fneg, double addrspace(1)* %out.gep
+  store double %fneg, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @divergent_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @divergent_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: name:            divergent_fabs_f64
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64
@@ -421,15 +421,15 @@ define amdgpu_kernel void @divergent_fabs_f64(double addrspace(1)* %out, double
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
-  %val = load volatile double, double addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds double, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
+  %val = load volatile double, ptr addrspace(1) %in.gep
   %fabs = call double @llvm.fabs.f64(double %val)
-  store double %fabs, double addrspace(1)* %out.gep
+  store double %fabs, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @uniform_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in, i64 %idx) {
+define amdgpu_kernel void @uniform_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %idx) {
 ; GCN-LABEL: name:            uniform_fabs_f64
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64
@@ -442,15 +442,15 @@ define amdgpu_kernel void @uniform_fabs_f64(double addrspace(1)* %out, double ad
 ; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[AND_COPY]], %subreg.sub1
 
 
-  %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %idx
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %idx
-  %val = load volatile double, double addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds double, ptr addrspace(1) %in, i64 %idx
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %idx
+  %val = load volatile double, ptr addrspace(1) %in.gep
   %fabs = call double @llvm.fabs.f64(double %val)
-  store double %fabs, double addrspace(1)* %out.gep
+  store double %fabs, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @divergent_fneg_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @divergent_fneg_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: name:            divergent_fneg_fabs_f64
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64
@@ -464,16 +464,16 @@ define amdgpu_kernel void @divergent_fneg_fabs_f64(double addrspace(1)* %out, do
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
-  %val = load volatile double, double addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds double, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
+  %val = load volatile double, ptr addrspace(1) %in.gep
   %fabs = call double @llvm.fabs.f64(double %val)
   %fneg = fneg double %fabs
-  store double %fneg, double addrspace(1)* %out.gep
+  store double %fneg, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @uniform_fneg_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in, i64 %idx) {
+define amdgpu_kernel void @uniform_fneg_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %idx) {
 ; GCN-LABEL: name:            uniform_fneg_fabs_f64
 ; GCN-LABEL: bb.0 (%ir-block.0)
 ; SI: %[[VREG64:[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64
@@ -486,12 +486,12 @@ define amdgpu_kernel void @uniform_fneg_fabs_f64(double addrspace(1)* %out, doub
 ; GCN: REG_SEQUENCE killed %[[LO32]], %subreg.sub0, killed %[[OR_COPY]], %subreg.sub1
 
 
-  %in.gep = getelementptr inbounds double, double addrspace(1)* %in, i64 %idx
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %idx
-  %val = load volatile double, double addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds double, ptr addrspace(1) %in, i64 %idx
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %idx
+  %val = load volatile double, ptr addrspace(1) %in.gep
   %fabs = call double @llvm.fabs.f64(double %val)
   %fneg = fneg double %fabs
-  store double %fneg, double addrspace(1)* %out.gep
+  store double %fneg, ptr addrspace(1) %out.gep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll b/llvm/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll
index 3637722d004d3..9fe566dd58baa 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll
@@ -5,14 +5,14 @@
 ; GCN: v_or_b32_e32 [[NEG:v[0-9]]], 0x80000000, [[V]]
 ; GCN: store_dword [[NEG]]
 
-define amdgpu_kernel void @fold_mul_neg(float addrspace(1)* %arg) {
+define amdgpu_kernel void @fold_mul_neg(ptr addrspace(1) %arg) {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
-  %v = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %tid
+  %v = load float, ptr addrspace(1) %gep, align 4
   %cmp = fcmp fast ogt float %v, 0.000000e+00
   %sel = select i1 %cmp, float -1.000000e+00, float 1.000000e+00
   %mul = fmul fast float %v, %sel
-  store float %mul, float addrspace(1)* %gep, align 4
+  store float %mul, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -21,14 +21,14 @@ define amdgpu_kernel void @fold_mul_neg(float addrspace(1)* %arg) {
 ; GCN: v_and_b32_e32 [[ABS:v[0-9]]], 0x7fffffff, [[V]]
 ; GCN: store_dword [[ABS]]
 
-define amdgpu_kernel void @fold_mul_abs(float addrspace(1)* %arg) {
+define amdgpu_kernel void @fold_mul_abs(ptr addrspace(1) %arg) {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
-  %v = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %tid
+  %v = load float, ptr addrspace(1) %gep, align 4
   %cmp = fcmp fast olt float %v, 0.000000e+00
   %sel = select i1 %cmp, float -1.000000e+00, float 1.000000e+00
   %mul = fmul fast float %v, %sel
-  store float %mul, float addrspace(1)* %gep, align 4
+  store float %mul, ptr addrspace(1) %gep, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
index a46f3e2fb3b42..3f197d28af83d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
@@ -208,11 +208,11 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, floa
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  store float %ret, float addrspace(1)* undef
+  store float %ret, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, float addrspace(3)* %out) {
+define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(3) %out) {
 ; SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
 ; SI:       ; %bb.0: ; %main_body
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -365,7 +365,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre
 ; GFX1010-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
 main_body:
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
-  store float %ret, float addrspace(3)* %out, align 8
+  store float %ret, ptr addrspace(3) %out, align 8
   ret void
 }
 
@@ -562,11 +562,11 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, floa
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  store float %ret, float addrspace(1)* undef
+  store float %ret, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, float addrspace(1)* %out) {
+define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, ptr addrspace(1) %out) {
 ; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
 ; SI:       ; %bb.0: ; %main_body
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -698,6 +698,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre
 ; G_GFX1100-NEXT:    s_endpgm
 main_body:
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
-  store float %ret, float addrspace(1)* %out, align 8
+  store float %ret, ptr addrspace(1) %out, align 8
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
index 17bf2ea89c6ca..c0ca697833b08 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll
@@ -3,12 +3,12 @@
 
 ; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10
 
-declare double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
-declare double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
-declare float @llvm.amdgcn.global.atomic.fmin.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
-declare float @llvm.amdgcn.global.atomic.fmax.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+declare float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+declare float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
 
-define amdgpu_kernel void @global_atomic_fmin_f32_noret(float addrspace(1)* %ptr, float %data) {
+define amdgpu_kernel void @global_atomic_fmin_f32_noret(ptr addrspace(1) %ptr, float %data) {
 ; GFX10-LABEL: global_atomic_fmin_f32_noret:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_clause 0x1
@@ -31,11 +31,11 @@ define amdgpu_kernel void @global_atomic_fmin_f32_noret(float addrspace(1)* %ptr
 ; G_GFX10-NEXT:    global_atomic_fmin v1, v0, s[2:3]
 ; G_GFX10-NEXT:    s_endpgm
 main_body:
-  %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+  %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fmax_f32_noret(float addrspace(1)* %ptr, float %data) {
+define amdgpu_kernel void @global_atomic_fmax_f32_noret(ptr addrspace(1) %ptr, float %data) {
 ; GFX10-LABEL: global_atomic_fmax_f32_noret:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_clause 0x1
@@ -58,11 +58,11 @@ define amdgpu_kernel void @global_atomic_fmax_f32_noret(float addrspace(1)* %ptr
 ; G_GFX10-NEXT:    global_atomic_fmax v1, v0, s[2:3]
 ; G_GFX10-NEXT:    s_endpgm
 main_body:
-  %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+  %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   ret void
 }
 
-define float @global_atomic_fmax_f32_rtn(float addrspace(1)* %ptr, float %data) {
+define float @global_atomic_fmax_f32_rtn(ptr addrspace(1) %ptr, float %data) {
 ; GFX10-LABEL: global_atomic_fmax_f32_rtn:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -79,11 +79,11 @@ define float @global_atomic_fmax_f32_rtn(float addrspace(1)* %ptr, float %data)
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+  %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   ret float %ret
 }
 
-define float @global_atomic_fmin_f32_rtn(float addrspace(1)* %ptr, float %data) {
+define float @global_atomic_fmin_f32_rtn(ptr addrspace(1) %ptr, float %data) {
 ; GFX10-LABEL: global_atomic_fmin_f32_rtn:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -100,11 +100,11 @@ define float @global_atomic_fmin_f32_rtn(float addrspace(1)* %ptr, float %data)
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+  %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   ret float %ret
 }
 
-define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %ptr, double %data) {
+define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX10-LABEL: global_atomic_fmin_f64_noret:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -125,11 +125,11 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %pt
 ; G_GFX10-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
 ; G_GFX10-NEXT:    s_endpgm
 main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %ptr, double %data) {
+define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX10-LABEL: global_atomic_fmax_f64_noret:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -150,11 +150,11 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %pt
 ; G_GFX10-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
 ; G_GFX10-NEXT:    s_endpgm
 main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret void
 }
 
-define double @global_atomic_fmax_f64_rtn(double addrspace(1)* %ptr, double %data) {
+define double @global_atomic_fmax_f64_rtn(ptr addrspace(1) %ptr, double %data) {
 ; GFX10-LABEL: global_atomic_fmax_f64_rtn:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -171,11 +171,11 @@ define double @global_atomic_fmax_f64_rtn(double addrspace(1)* %ptr, double %dat
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret double %ret
 }
 
-define double @global_atomic_fmin_f64_rtn(double addrspace(1)* %ptr, double %data) {
+define double @global_atomic_fmin_f64_rtn(ptr addrspace(1) %ptr, double %data) {
 ; GFX10-LABEL: global_atomic_fmin_f64_rtn:
 ; GFX10:       ; %bb.0: ; %main_body
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -192,6 +192,6 @@ define double @global_atomic_fmin_f64_rtn(double addrspace(1)* %ptr, double %dat
 ; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret double %ret
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
index 6873eaf022ea1..939d45c74107d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
@@ -180,11 +180,11 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub
 ; G_GFX1030-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  store double %ret, double addrspace(3)* undef
+  store double %ret, ptr addrspace(3) undef
   ret void
 }
 
-define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(3)* %out) {
+define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) {
 ; SI-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; SI:       ; %bb.0: ; %main_body
 ; SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
@@ -246,7 +246,7 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %r
 ; G_GFX1030-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
-  store double %ret, double addrspace(3)* %out, align 8
+  store double %ret, ptr addrspace(3) %out, align 8
   ret void
 }
 
@@ -417,11 +417,11 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub
 ; G_GFX1030-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  store double %ret, double addrspace(3)* undef
+  store double %ret, ptr addrspace(3) undef
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(3)* %out) {
+define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) {
 ; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; SI:       ; %bb.0: ; %main_body
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -531,6 +531,6 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
 ; G_GFX1030-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
-  store double %ret, double addrspace(3)* %out, align 8
+  store double %ret, ptr addrspace(3) %out, align 8
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index a50da7f08dda6..bb9d4028b7e20 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -8,12 +8,12 @@
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fptrunc_f32_to_f16(
-    half addrspace(1)* %r,
-    float addrspace(1)* %a) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) {
 entry:
-  %a.val = load float, float addrspace(1)* %a
+  %a.val = load float, ptr addrspace(1) %a
   %r.val = fptrunc float %a.val to half
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -24,12 +24,12 @@ entry:
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fptrunc_f64_to_f16(
-    half addrspace(1)* %r,
-    double addrspace(1)* %a) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) {
 entry:
-  %a.val = load double, double addrspace(1)* %a
+  %a.val = load double, ptr addrspace(1) %a
   %r.val = fptrunc double %a.val to half
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -50,12 +50,12 @@ entry:
 ; GCN:     s_endpgm
 
 define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
-    <2 x half> addrspace(1)* %r,
-    <2 x float> addrspace(1)* %a) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) {
 entry:
-  %a.val = load <2 x float>, <2 x float> addrspace(1)* %a
+  %a.val = load <2 x float>, ptr addrspace(1) %a
   %r.val = fptrunc <2 x float> %a.val to <2 x half>
-  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  store <2 x half> %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -77,12 +77,12 @@ entry:
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 
 define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
-    <2 x half> addrspace(1)* %r,
-    <2 x double> addrspace(1)* %a) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) {
 entry:
-  %a.val = load <2 x double>, <2 x double> addrspace(1)* %a
+  %a.val = load <2 x double>, ptr addrspace(1) %a
   %r.val = fptrunc <2 x double> %a.val to <2 x half>
-  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  store <2 x half> %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -92,13 +92,13 @@ entry:
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
-    half addrspace(1)* %r,
-    float addrspace(1)* %a) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) {
 entry:
-  %a.val = load float, float addrspace(1)* %a
+  %a.val = load float, ptr addrspace(1) %a
   %a.fneg = fneg float %a.val
   %r.val = fptrunc float %a.fneg to half
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -108,13 +108,13 @@ entry:
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
-    half addrspace(1)* %r,
-    float addrspace(1)* %a) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) {
 entry:
-  %a.val = load float, float addrspace(1)* %a
+  %a.val = load float, ptr addrspace(1) %a
   %a.fabs = call float @llvm.fabs.f32(float %a.val)
   %r.val = fptrunc float %a.fabs to half
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -124,14 +124,14 @@ entry:
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
-    half addrspace(1)* %r,
-    float addrspace(1)* %a) #0 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) #0 {
 entry:
-  %a.val = load float, float addrspace(1)* %a
+  %a.val = load float, ptr addrspace(1) %a
   %a.fabs = call float @llvm.fabs.f32(float %a.val)
   %a.fneg.fabs = fneg float %a.fabs
   %r.val = fptrunc float %a.fneg.fabs to half
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -142,14 +142,14 @@ entry:
 ; GFX9-NOT: v_and_b32
 ; GCN: buffer_store_dword v[[R_F16]]
 define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
-    i32 addrspace(1)* %r,
-    float addrspace(1)* %a) #0 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) #0 {
 entry:
-  %a.val = load float, float addrspace(1)* %a
+  %a.val = load float, ptr addrspace(1) %a
   %r.val = fptrunc float %a.val to half
   %r.i16 = bitcast half %r.val to i16
   %zext = zext i16 %r.i16 to i32
-  store i32 %zext, i32 addrspace(1)* %r
+  store i32 %zext, ptr addrspace(1) %r
   ret void
 }
 
@@ -160,15 +160,15 @@ entry:
 ; GFX9-NOT: v_and_b32
 ; GCN: buffer_store_dword v[[R_F16]]
 define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
-    i32 addrspace(1)* %r,
-    float addrspace(1)* %a) #0 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) #0 {
 entry:
-  %a.val = load float, float addrspace(1)* %a
+  %a.val = load float, ptr addrspace(1) %a
   %a.fabs = call float @llvm.fabs.f32(float %a.val)
   %r.val = fptrunc float %a.fabs to half
   %r.i16 = bitcast half %r.val to i16
   %zext = zext i16 %r.i16 to i32
-  store i32 %zext, i32 addrspace(1)* %r
+  store i32 %zext, ptr addrspace(1) %r
   ret void
 }
 
@@ -178,14 +178,14 @@ entry:
 ; GCN: v_bfe_i32 v[[R_F16_SEXT:[0-9]+]], v[[R_F16]], 0, 16
 ; GCN: buffer_store_dword v[[R_F16_SEXT]]
 define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
-    i32 addrspace(1)* %r,
-    float addrspace(1)* %a) #0 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) #0 {
 entry:
-  %a.val = load float, float addrspace(1)* %a
+  %a.val = load float, ptr addrspace(1) %a
   %r.val = fptrunc float %a.val to half
   %r.i16 = bitcast half %r.val to i16
   %zext = sext i16 %r.i16 to i32
-  store i32 %zext, i32 addrspace(1)* %r
+  store i32 %zext, ptr addrspace(1) %r
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index a977c2ebd746b..52ba95926f727 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -4,9 +4,9 @@
 
 ; GCN-LABEL: {{^}}fptrunc_f64_to_f32:
 ; GCN: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) {
   %result = fptrunc double %in to float
-  store float %result, float addrspace(1)* %out
+  store float %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -14,19 +14,19 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %
 ; GCN-NOT: v_cvt
 ; GCN-UNSAFE: v_cvt_f32_f64_e32 [[F32:v[0-9]+]]
 ; GCN-UNSAFE: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[F32]]
-define amdgpu_kernel void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) {
   %result = fptrunc double %in to half
   %result_i16 = bitcast half %result to i16
-  store i16 %result_i16, i16 addrspace(1)* %out
+  store i16 %result_i16, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f32:
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
-define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x double> %in) {
   %result = fptrunc <2 x double> %in to <2 x float>
-  store <2 x float> %result, <2 x float> addrspace(1)* %out
+  store <2 x float> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -34,9 +34,9 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
-define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(<3 x float> addrspace(1)* %out, <3 x double> %in) {
+define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x double> %in) {
   %result = fptrunc <3 x double> %in to <3 x float>
-  store <3 x float> %result, <3 x float> addrspace(1)* %out
+  store <3 x float> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -45,9 +45,9 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(<3 x float> addrspace(1)* %out
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
-define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x double> %in) {
   %result = fptrunc <4 x double> %in to <4 x float>
-  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  store <4 x float> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -60,8 +60,8 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
-define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
+define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x double> %in) {
   %result = fptrunc <8 x double> %in to <8 x float>
-  store <8 x float> %result, <8 x float> addrspace(1)* %out
+  store <8 x float> %result, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 77fe9d3817c61..00540a299d058 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -21,7 +21,7 @@
 ; GCN: ds_write_b32 v0, v0
 define void @func_mov_fi_i32() #0 {
   %alloca = alloca i32, addrspace(5)
-  store volatile i32 addrspace(5)* %alloca, i32 addrspace(5)* addrspace(3)* undef
+  store volatile ptr addrspace(5) %alloca, ptr addrspace(3) undef
   ret void
 }
 
@@ -47,8 +47,8 @@ define void @func_mov_fi_i32() #0 {
 define void @func_mov_fi_i32_offset() #0 {
   %alloca0 = alloca i32, addrspace(5)
   %alloca1 = alloca i32, addrspace(5)
-  store volatile i32 addrspace(5)* %alloca0, i32 addrspace(5)* addrspace(3)* undef
-  store volatile i32 addrspace(5)* %alloca1, i32 addrspace(5)* addrspace(3)* undef
+  store volatile ptr addrspace(5) %alloca0, ptr addrspace(3) undef
+  store volatile ptr addrspace(5) %alloca1, ptr addrspace(3) undef
   ret void
 }
 
@@ -71,8 +71,8 @@ define void @func_mov_fi_i32_offset() #0 {
 ; GCN: ds_write_b32 v0, v0
 define void @func_add_constant_to_fi_i32() #0 {
   %alloca = alloca [2 x i32], align 4, addrspace(5)
-  %gep0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %alloca, i32 0, i32 1
-  store volatile i32 addrspace(5)* %gep0, i32 addrspace(5)* addrspace(3)* undef
+  %gep0 = getelementptr inbounds [2 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
+  store volatile ptr addrspace(5) %gep0, ptr addrspace(3) undef
   ret void
 }
 
@@ -93,9 +93,9 @@ define void @func_add_constant_to_fi_i32() #0 {
 ; GCN: ds_write_b32 v0, v0
 define void @func_other_fi_user_i32() #0 {
   %alloca = alloca [2 x i32], align 4, addrspace(5)
-  %ptrtoint = ptrtoint [2 x i32] addrspace(5)* %alloca to i32
+  %ptrtoint = ptrtoint ptr addrspace(5) %alloca to i32
   %mul = mul i32 %ptrtoint, 9
-  store volatile i32 %mul, i32 addrspace(3)* undef
+  store volatile i32 %mul, ptr addrspace(3) undef
   ret void
 }
 
@@ -103,8 +103,8 @@ define void @func_other_fi_user_i32() #0 {
 ; GCN: v_mov_b32_e32 v1, 15{{$}}
 ; MUBUF:        buffer_store_dword v1, v0, s[0:3], 0 offen{{$}}
 ; GFX9-FLATSCR: scratch_store_dword v0, v1, off{{$}}
-define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
-  store volatile i32 15, i32 addrspace(5)* %ptr
+define void @func_store_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 {
+  store volatile i32 15, ptr addrspace(5) %ptr
   ret void
 }
 
@@ -112,8 +112,8 @@ define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
 ; GCN: s_waitcnt
 ; MUBUF-NEXT:        buffer_load_dword v0, v0, s[0:3], 0 offen glc{{$}}
 ; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off glc{{$}}
-define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
-  %val = load volatile i32, i32 addrspace(5)* %ptr
+define void @func_load_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 {
+  %val = load volatile i32, ptr addrspace(5) %ptr
   ret void
 }
 
@@ -131,11 +131,11 @@ define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
 
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
-define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0) #0 {
-  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0
-  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 1
-  %load1 = load i32, i32 addrspace(5)* %gep1
-  store volatile i32 addrspace(5)* %gep1, i32 addrspace(5)* addrspace(3)* undef
+define void @void_func_byval_struct_i8_i32_ptr(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 {
+  %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0
+  %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 1
+  %load1 = load i32, ptr addrspace(5) %gep1
+  store volatile ptr addrspace(5) %gep1, ptr addrspace(3) undef
   ret void
 }
 
@@ -145,13 +145,13 @@ define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 } addrspace(5)* byval({
 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
 ; GFX9-FLATSCR-NEXT: scratch_load_ubyte v0, off, s32
 ; GFX9-FLATSCR-NEXT: scratch_load_dword v1, off, s32 offset:4
-define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0) #0 {
-  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0
-  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 1
-  %load0 = load i8, i8 addrspace(5)* %gep0
-  %load1 = load i32, i32 addrspace(5)* %gep1
-  store volatile i8 %load0, i8 addrspace(3)* undef
-  store volatile i32 %load1, i32 addrspace(3)* undef
+define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 {
+  %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0
+  %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 1
+  %load0 = load i8, ptr addrspace(5) %gep0
+  %load1 = load i32, ptr addrspace(5) %gep1
+  store volatile i8 %load0, ptr addrspace(3) undef
+  store volatile i32 %load1, ptr addrspace(3) undef
   ret void
 }
 
@@ -172,15 +172,15 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* b
 ; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]]
 
 ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
-define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0, i32 %arg2) #0 {
+define void @void_func_byval_struct_i8_i32_ptr_nonentry_block(ptr addrspace(5) byval({ i8, i32 }) %arg0, i32 %arg2) #0 {
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %bb, label %ret
 
 bb:
-  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0
-  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 1
-  %load1 = load volatile i32, i32 addrspace(5)* %gep1
-  store volatile i32 addrspace(5)* %gep1, i32 addrspace(5)* addrspace(3)* undef
+  %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0
+  %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 1
+  %load1 = load volatile i32, ptr addrspace(5) %gep1
+  store volatile ptr addrspace(5) %gep1, ptr addrspace(3) undef
   br label %ret
 
 ret:
@@ -205,12 +205,11 @@ ret:
 define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
   %alloca0 = alloca [128 x i32], align 4, addrspace(5)
   %alloca1 = alloca [8 x i32], align 4, addrspace(5)
-  %gep0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca0, i32 0, i32 65
-  %gep1 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %alloca1, i32 0, i32 0
-  store volatile i32 7, i32 addrspace(5)* %gep0
-  %ptrtoint = ptrtoint i32 addrspace(5)* %gep1 to i32
+  %gep0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca0, i32 0, i32 65
+  store volatile i32 7, ptr addrspace(5) %gep0
+  %ptrtoint = ptrtoint ptr addrspace(5) %alloca1 to i32
   %mul = mul i32 %ptrtoint, 9
-  store volatile i32 %mul, i32 addrspace(3)* undef
+  store volatile i32 %mul, ptr addrspace(3) undef
   ret void
 }
 
@@ -232,17 +231,16 @@ define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 {
   %alloca0 = alloca [128 x i32], align 4, addrspace(5)
   %alloca1 = alloca [8 x i32], align 4, addrspace(5)
   %vcc = call i64 asm sideeffect "; def $0", "={vcc}"()
-  %gep0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca0, i32 0, i32 65
-  %gep1 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %alloca1, i32 0, i32 0
-  store volatile i32 7, i32 addrspace(5)* %gep0
+  %gep0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca0, i32 0, i32 65
+  store volatile i32 7, ptr addrspace(5) %gep0
   call void asm sideeffect "; use $0", "{vcc}"(i64 %vcc)
-  %ptrtoint = ptrtoint i32 addrspace(5)* %gep1 to i32
+  %ptrtoint = ptrtoint ptr addrspace(5) %alloca1 to i32
   %mul = mul i32 %ptrtoint, 9
-  store volatile i32 %mul, i32 addrspace(3)* undef
+  store volatile i32 %mul, ptr addrspace(3) undef
   ret void
 }
 
-declare void @func(<4 x float> addrspace(5)* nocapture) #0
+declare void @func(ptr addrspace(5) nocapture) #0
 
 ; undef flag not preserved in eliminateFrameIndex when handling the
 ; stores in the middle block.
@@ -261,14 +259,14 @@ define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 {
 bb:
   %tmp = alloca <4 x float>, align 16, addrspace(5)
   %tmp2 = insertelement <4 x float> undef, float %arg, i32 0
-  store <4 x float> %tmp2, <4 x float> addrspace(5)* undef
+  store <4 x float> %tmp2, ptr addrspace(5) undef
   %tmp3 = icmp eq i32 %arg1, 0
   br i1 %tmp3, label %bb4, label %bb5
 
 bb4:
-  call void @func(<4 x float> addrspace(5)* nonnull undef)
-  store <4 x float> %tmp2, <4 x float> addrspace(5)* %tmp, align 16
-  call void @func(<4 x float> addrspace(5)* nonnull %tmp)
+  call void @func(ptr addrspace(5) nonnull undef)
+  store <4 x float> %tmp2, ptr addrspace(5) %tmp, align 16
+  call void @func(ptr addrspace(5) nonnull %tmp)
   br label %bb5
 
 bb5:
@@ -296,10 +294,10 @@ define void @alloca_ptr_nonentry_block(i32 %arg0) #0 {
   br i1 %cmp, label %bb, label %ret
 
 bb:
-  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %alloca0, i32 0, i32 0
-  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %alloca0, i32 0, i32 1
-  %load1 = load volatile i32, i32 addrspace(5)* %gep1
-  store volatile i32 addrspace(5)* %gep1, i32 addrspace(5)* addrspace(3)* undef
+  %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %alloca0, i32 0, i32 0
+  %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %alloca0, i32 0, i32 1
+  %load1 = load volatile i32, ptr addrspace(5) %gep1
+  store volatile ptr addrspace(5) %gep1, ptr addrspace(3) undef
   br label %ret
 
 ret:
@@ -321,16 +319,16 @@ define protected amdgpu_kernel void @tied_operand_test(i1 %c1, i1 %c2, i32 %val)
 entry:
   %scratch0 = alloca i16, align 4, addrspace(5)
   %scratch1 = alloca i16, align 4, addrspace(5)
-  %first = select i1 %c1, i16 addrspace(5)* %scratch0, i16 addrspace(5)* %scratch1
-  %spec.select = select i1 %c2, i16 addrspace(5)* %first, i16 addrspace(5)* %scratch0
-  %dead.load = load i16, i16 addrspace(5)* %spec.select, align 2
-  %scratch0.load = load i16, i16 addrspace(5)* %scratch0, align 4
+  %first = select i1 %c1, ptr addrspace(5) %scratch0, ptr addrspace(5) %scratch1
+  %spec.select = select i1 %c2, ptr addrspace(5) %first, ptr addrspace(5) %scratch0
+  %dead.load = load i16, ptr addrspace(5) %spec.select, align 2
+  %scratch0.load = load i16, ptr addrspace(5) %scratch0, align 4
   %add4 = add nuw nsw i32 %val, 4
-  %addr0 = getelementptr inbounds %struct0, %struct0 addrspace(3)* bitcast (%struct0 addrspace(3)* @_ZZN0 to %struct0 addrspace(3)*), i32 0, i32 0, i32 %add4, i32 0
-  store i16 123, i16 addrspace(3)* %addr0, align 2
+  %addr0 = getelementptr inbounds %struct0, ptr addrspace(3) @_ZZN0, i32 0, i32 0, i32 %add4, i32 0
+  store i16 123, ptr addrspace(3) %addr0, align 2
   %add5 = add nuw nsw i32 %val, 5
-  %addr1 = getelementptr inbounds %struct0, %struct0 addrspace(3)* bitcast (%struct0 addrspace(3)* @_ZZN0 to %struct0 addrspace(3)*), i32 0, i32 0, i32 %add5, i32 0
-  store i16 %scratch0.load, i16 addrspace(3)* %addr1, align 2
+  %addr1 = getelementptr inbounds %struct0, ptr addrspace(3) @_ZZN0, i32 0, i32 0, i32 %add5, i32 0
+  store i16 %scratch0.load, ptr addrspace(3) %addr1, align 2
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index d8098f6d7ead4..8029d4270b936 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -6,7 +6,7 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
 
-define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
+define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -188,16 +188,16 @@ define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)*
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                      half addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
-   %r0 = load half, half addrspace(1)* %in1, align 4
-   %r1 = load half, half addrspace(1)* %gep2, align 4
+                      ptr addrspace(1) %in2) #0 {
+   %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
+   %r0 = load half, ptr addrspace(1) %in1, align 4
+   %r1 = load half, ptr addrspace(1) %gep2, align 4
    %r2 = frem half %r0, %r1
-   store half %r2, half addrspace(1)* %out, align 4
+   store half %r2, ptr addrspace(1) %out, align 4
    ret void
 }
 
-define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
+define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: fast_frem_f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -333,16 +333,16 @@ define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                      half addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
-   %r0 = load half, half addrspace(1)* %in1, align 4
-   %r1 = load half, half addrspace(1)* %gep2, align 4
+                      ptr addrspace(1) %in2) #0 {
+   %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
+   %r0 = load half, ptr addrspace(1) %in1, align 4
+   %r1 = load half, ptr addrspace(1) %gep2, align 4
    %r2 = frem fast half %r0, %r1
-   store half %r2, half addrspace(1)* %out, align 4
+   store half %r2, ptr addrspace(1) %out, align 4
    ret void
 }
 
-define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
+define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: unsafe_frem_f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -478,16 +478,16 @@ define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspa
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                             half addrspace(1)* %in2) #1 {
-   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
-   %r0 = load half, half addrspace(1)* %in1, align 4
-   %r1 = load half, half addrspace(1)* %gep2, align 4
+                             ptr addrspace(1) %in2) #1 {
+   %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
+   %r0 = load half, ptr addrspace(1) %in1, align 4
+   %r1 = load half, ptr addrspace(1) %gep2, align 4
    %r2 = frem afn half %r0, %r1
-   store half %r2, half addrspace(1)* %out, align 4
+   store half %r2, ptr addrspace(1) %out, align 4
    ret void
 }
 
-define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -685,16 +685,16 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                      float addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
-   %r0 = load float, float addrspace(1)* %in1, align 4
-   %r1 = load float, float addrspace(1)* %gep2, align 4
+                      ptr addrspace(1) %in2) #0 {
+   %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
+   %r0 = load float, ptr addrspace(1) %in1, align 4
+   %r1 = load float, ptr addrspace(1) %gep2, align 4
    %r2 = frem float %r0, %r1
-   store float %r2, float addrspace(1)* %out, align 4
+   store float %r2, ptr addrspace(1) %out, align 4
    ret void
 }
 
-define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: fast_frem_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -822,16 +822,16 @@ define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspa
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                      float addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
-   %r0 = load float, float addrspace(1)* %in1, align 4
-   %r1 = load float, float addrspace(1)* %gep2, align 4
+                      ptr addrspace(1) %in2) #0 {
+   %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
+   %r0 = load float, ptr addrspace(1) %in1, align 4
+   %r1 = load float, ptr addrspace(1) %gep2, align 4
    %r2 = frem fast float %r0, %r1
-   store float %r2, float addrspace(1)* %out, align 4
+   store float %r2, ptr addrspace(1) %out, align 4
    ret void
 }
 
-define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: unsafe_frem_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -959,16 +959,16 @@ define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrs
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                             float addrspace(1)* %in2) #1 {
-   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
-   %r0 = load float, float addrspace(1)* %in1, align 4
-   %r1 = load float, float addrspace(1)* %gep2, align 4
+                             ptr addrspace(1) %in2) #1 {
+   %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
+   %r0 = load float, ptr addrspace(1) %in1, align 4
+   %r1 = load float, ptr addrspace(1) %gep2, align 4
    %r2 = frem afn float %r0, %r1
-   store float %r2, float addrspace(1)* %out, align 4
+   store float %r2, ptr addrspace(1) %out, align 4
    ret void
 }
 
-define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
@@ -1179,15 +1179,15 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(
 ; GFX11-NEXT:    global_store_b64 v12, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                      double addrspace(1)* %in2) #0 {
-   %r0 = load double, double addrspace(1)* %in1, align 8
-   %r1 = load double, double addrspace(1)* %in2, align 8
+                      ptr addrspace(1) %in2) #0 {
+   %r0 = load double, ptr addrspace(1) %in1, align 8
+   %r1 = load double, ptr addrspace(1) %in2, align 8
    %r2 = frem double %r0, %r1
-   store double %r2, double addrspace(1)* %out, align 8
+   store double %r2, ptr addrspace(1) %out, align 8
    ret void
 }
 
-define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: fast_frem_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1372,15 +1372,15 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
 ; GFX11-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                      double addrspace(1)* %in2) #0 {
-   %r0 = load double, double addrspace(1)* %in1, align 8
-   %r1 = load double, double addrspace(1)* %in2, align 8
+                      ptr addrspace(1) %in2) #0 {
+   %r0 = load double, ptr addrspace(1) %in1, align 8
+   %r1 = load double, ptr addrspace(1) %in2, align 8
    %r2 = frem fast double %r0, %r1
-   store double %r2, double addrspace(1)* %out, align 8
+   store double %r2, ptr addrspace(1) %out, align 8
    ret void
 }
 
-define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: unsafe_frem_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1565,15 +1565,15 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
 ; GFX11-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                             double addrspace(1)* %in2) #1 {
-   %r0 = load double, double addrspace(1)* %in1, align 8
-   %r1 = load double, double addrspace(1)* %in2, align 8
+                             ptr addrspace(1) %in2) #1 {
+   %r0 = load double, ptr addrspace(1) %in1, align 8
+   %r1 = load double, ptr addrspace(1) %in2, align 8
    %r2 = frem afn double %r0, %r1
-   store double %r2, double addrspace(1)* %out, align 8
+   store double %r2, ptr addrspace(1) %out, align 8
    ret void
 }
 
-define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1849,16 +1849,16 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                        <2 x half> addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4
-   %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8
-   %r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8
+                        ptr addrspace(1) %in2) #0 {
+   %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
+   %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
+   %r1 = load <2 x half>, ptr addrspace(1) %gep2, align 8
    %r2 = frem <2 x half> %r0, %r1
-   store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8
+   store <2 x half> %r2, ptr addrspace(1) %out, align 8
    ret void
 }
 
-define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2299,16 +2299,16 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                        <4 x half> addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4
-   %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16
-   %r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16
+                        ptr addrspace(1) %in2) #0 {
+   %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
+   %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
+   %r1 = load <4 x half>, ptr addrspace(1) %gep2, align 16
    %r2 = frem <4 x half> %r0, %r1
-   store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16
+   store <4 x half> %r2, ptr addrspace(1) %out, align 16
    ret void
 }
 
-define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2602,16 +2602,16 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                        <2 x float> addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
-   %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8
-   %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8
+                        ptr addrspace(1) %in2) #0 {
+   %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
+   %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8
+   %r1 = load <2 x float>, ptr addrspace(1) %gep2, align 8
    %r2 = frem <2 x float> %r0, %r1
-   store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8
+   store <2 x float> %r2, ptr addrspace(1) %out, align 8
    ret void
 }
 
-define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3097,16 +3097,16 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                        <4 x float> addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
-   %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
-   %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16
+                        ptr addrspace(1) %in2) #0 {
+   %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
+   %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16
+   %r1 = load <4 x float>, ptr addrspace(1) %gep2, align 16
    %r2 = frem <4 x float> %r0, %r1
-   store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16
+   store <4 x float> %r2, ptr addrspace(1) %out, align 16
    ret void
 }
 
-define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_v2f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
@@ -3427,12 +3427,12 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
 ; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-                        <2 x double> addrspace(1)* %in2) #0 {
-   %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
-   %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
-   %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16
+                        ptr addrspace(1) %in2) #0 {
+   %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
+   %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16
+   %r1 = load <2 x double>, ptr addrspace(1) %gep2, align 16
    %r2 = frem <2 x double> %r0, %r1
-   store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
+   store <2 x double> %r2, ptr addrspace(1) %out, align 16
    ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
index 1f67442c45a9e..050016fc07202 100644
--- a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -13,10 +13,10 @@ declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone
 ; CI: v_trunc_f64
 ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0xb0014
 ; SI: s_endpgm
-define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
-  %x = load double, double addrspace(1)* %in, align 8
+define amdgpu_kernel void @v_ftrunc_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %x = load double, ptr addrspace(1) %in, align 8
   %y = call double @llvm.trunc.f64(double %x) nounwind readnone
-  store double %y, double addrspace(1)* %out, align 8
+  store double %y, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -35,18 +35,18 @@ define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrsp
 ; SI-DAG: s_cselect_b32
 ; SI-DAG: s_cselect_b32
 ; SI: s_endpgm
-define amdgpu_kernel void @ftrunc_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ftrunc_f64(ptr addrspace(1) %out, double %x) {
   %y = call double @llvm.trunc.f64(double %x) nounwind readnone
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}ftrunc_v2f64:
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
-define amdgpu_kernel void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @ftrunc_v2f64(ptr addrspace(1) %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone
-  store <2 x double> %y, <2 x double> addrspace(1)* %out
+  store <2 x double> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -54,9 +54,9 @@ define amdgpu_kernel void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x do
 ; FIXME-CI: v_trunc_f64_e32
 ; FIXME-CI: v_trunc_f64_e32
 ; FIXME-CI: v_trunc_f64_e32
-; define amdgpu_kernel void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; define amdgpu_kernel void @ftrunc_v3f64(ptr addrspace(1) %out, <3 x double> %x) {
 ;   %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone
-;   store <3 x double> %y, <3 x double> addrspace(1)* %out
+;   store <3 x double> %y, ptr addrspace(1) %out
 ;   ret void
 ; }
 
@@ -65,9 +65,9 @@ define amdgpu_kernel void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x do
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
-define amdgpu_kernel void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @ftrunc_v4f64(ptr addrspace(1) %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone
-  store <4 x double> %y, <4 x double> addrspace(1)* %out
+  store <4 x double> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -80,9 +80,9 @@ define amdgpu_kernel void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x do
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
-define amdgpu_kernel void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @ftrunc_v8f64(ptr addrspace(1) %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone
-  store <8 x double> %y, <8 x double> addrspace(1)* %out
+  store <8 x double> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -103,8 +103,8 @@ define amdgpu_kernel void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x do
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
-define amdgpu_kernel void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @ftrunc_v16f64(ptr addrspace(1) %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone
-  store <16 x double> %y, <16 x double> addrspace(1)* %out
+  store <16 x double> %y, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/ftrunc.ll b/llvm/test/CodeGen/AMDGPU/ftrunc.ll
index b5ad01eaeaf0a..14cd878f46f2c 100644
--- a/llvm/test/CodeGen/AMDGPU/ftrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/ftrunc.ll
@@ -12,9 +12,9 @@ declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone
 ; FUNC-LABEL: {{^}}ftrunc_f32:
 ; EG: TRUNC
 ; SI: v_trunc_f32_e32
-define amdgpu_kernel void @ftrunc_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @ftrunc_f32(ptr addrspace(1) %out, float %x) {
   %y = call float @llvm.trunc.f32(float %x) nounwind readnone
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -23,9 +23,9 @@ define amdgpu_kernel void @ftrunc_f32(float addrspace(1)* %out, float %x) {
 ; EG: TRUNC
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
-define amdgpu_kernel void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @ftrunc_v2f32(ptr addrspace(1) %out, <2 x float> %x) {
   %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone
-  store <2 x float> %y, <2 x float> addrspace(1)* %out
+  store <2 x float> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -36,9 +36,9 @@ define amdgpu_kernel void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x flo
 ; FIXME-SI: v_trunc_f32_e32
 ; FIXME-SI: v_trunc_f32_e32
 ; FIXME-SI: v_trunc_f32_e32
-; define amdgpu_kernel void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+; define amdgpu_kernel void @ftrunc_v3f32(ptr addrspace(1) %out, <3 x float> %x) {
 ;   %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone
-;   store <3 x float> %y, <3 x float> addrspace(1)* %out
+;   store <3 x float> %y, ptr addrspace(1) %out
 ;   ret void
 ; }
 
@@ -51,9 +51,9 @@ define amdgpu_kernel void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x flo
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
-define amdgpu_kernel void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @ftrunc_v4f32(ptr addrspace(1) %out, <4 x float> %x) {
   %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone
-  store <4 x float> %y, <4 x float> addrspace(1)* %out
+  store <4 x float> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -74,9 +74,9 @@ define amdgpu_kernel void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x flo
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
-define amdgpu_kernel void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+define amdgpu_kernel void @ftrunc_v8f32(ptr addrspace(1) %out, <8 x float> %x) {
   %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone
-  store <8 x float> %y, <8 x float> addrspace(1)* %out
+  store <8 x float> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -113,8 +113,8 @@ define amdgpu_kernel void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x flo
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
-define amdgpu_kernel void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+define amdgpu_kernel void @ftrunc_v16f32(ptr addrspace(1) %out, <16 x float> %x) {
   %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone
-  store <16 x float> %y, <16 x float> addrspace(1)* %out
+  store <16 x float> %y, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index 54d5c38bad4c0..4eaca1701ea90 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -228,7 +228,7 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
   ret void
 }
 
-define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 {
+define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) %out) #0 {
 ; GFX9-LABEL: test_call_void_func_void_mayclobber_s31:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -361,7 +361,7 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)
   ret void
 }
 
-define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
+define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) %out) #0 {
 ; GFX9-LABEL: test_call_void_func_void_mayclobber_v31:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -496,7 +496,7 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
 }
 
 
-define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)* %out) #0 {
+define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) %out) #0 {
 ; GFX9-LABEL: test_call_void_func_void_preserves_s33:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -630,7 +630,7 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
   ret void
 }
 
-define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)* %out) #0 {
+define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) %out) #0 {
 ; GFX9-LABEL: test_call_void_func_void_preserves_s34:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -764,7 +764,7 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
   ret void
 }
 
-define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)* %out) #0 {
+define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) %out) #0 {
 ; GFX9-LABEL: test_call_void_func_void_preserves_v40:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

diff --git a/llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll b/llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll
index f222dca468447..a6d7c20718916 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll
@@ -8,12 +8,12 @@
 ; GFX9:  v_mov_b32_e32 [[C2:v[0-9]+]], 0xe7
 ; GFX9:  v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0x80992bff, v{{[0-9]+}}
 ; GFX9:  v_addc_co_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, [[C2]], vcc
-define amdgpu_kernel void @test_add_lit(i64 addrspace(1)* %p) {
+define amdgpu_kernel void @test_add_lit(ptr addrspace(1) %p) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %ptr = getelementptr inbounds i64, i64 addrspace(1)* %p, i32 %id
-  %load = load i64, i64 addrspace(1)* %ptr, align 8
+  %ptr = getelementptr inbounds i64, ptr addrspace(1) %p, i32 %id
+  %load = load i64, ptr addrspace(1) %ptr, align 8
   %add = add nsw i64 %load, 994294967295
-  store i64 %add, i64 addrspace(1)* %ptr, align 8
+  store i64 %add, ptr addrspace(1) %ptr, align 8
   ret void
 }
 
@@ -21,16 +21,16 @@ define amdgpu_kernel void @test_add_lit(i64 addrspace(1)* %p) {
 ; GFX10: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3039, v{{[0-9]+}}, vcc_lo
 ; GFX9:  v_mov_b32_e32 [[C:v[0-9]+]], 0x3039
 ; GFX9:  v_cndmask_b32_e32 v{{[0-9]+}}, [[C]], v{{[0-9]+}}, vcc
-define amdgpu_kernel void @test_cndmask_lit(i32 addrspace(1)* %p) {
+define amdgpu_kernel void @test_cndmask_lit(ptr addrspace(1) %p) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %n = add nuw nsw i32 %id, 1
-  %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %id
-  %v1 = load i32, i32 addrspace(1)* %p1, align 4
-  %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %n
-  %v2 = load i32, i32 addrspace(1)* %p2, align 4
+  %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %id
+  %v1 = load i32, ptr addrspace(1) %p1, align 4
+  %p2 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %n
+  %v2 = load i32, ptr addrspace(1) %p2, align 4
   %cmp = icmp sgt i32 %v1, 0
   %sel = select i1 %cmp, i32 12345, i32 %v2
-  store i32 %sel, i32 addrspace(1)* %p1, align 4
+  store i32 %sel, ptr addrspace(1) %p1, align 4
   ret void
 }
 
@@ -40,9 +40,9 @@ define amdgpu_kernel void @test_cndmask_lit(i32 addrspace(1)* %p) {
 ; GFX9-DAG: v_mov_b32_e32 [[C2:v[0-9]+]], 0xddd5
 ; GFX9-DAG: s_movk_i32 [[C1:s[0-9]+]], 0x3039
 ; GFX9:     v_bfe_u32 v{{[0-9]+}}, [[C1]], v{{[0-9]+}}, [[C2]]
-define amdgpu_kernel void @test_bfe_2lit_s(i32 addrspace(1)* %p, i32 %src) {
+define amdgpu_kernel void @test_bfe_2lit_s(ptr addrspace(1) %p, i32 %src) {
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 12345, i32 %src, i32 56789)
-  store i32 %bfe, i32 addrspace(1)* %p, align 4
+  store i32 %bfe, ptr addrspace(1) %p, align 4
   ret void
 }
 
@@ -52,12 +52,12 @@ define amdgpu_kernel void @test_bfe_2lit_s(i32 addrspace(1)* %p, i32 %src) {
 ; GFX9-DAG: v_mov_b32_e32 [[C2:v[0-9]+]], 0xddd5
 ; GFX9-DAG: s_movk_i32 [[C1:s[0-9]+]], 0x3039
 ; GFX9:     v_bfe_u32 v{{[0-9]+}}, [[C1]], v{{[0-9]+}}, [[C2]]
-define amdgpu_kernel void @test_bfe_2lit_v(i32 addrspace(1)* %p) {
+define amdgpu_kernel void @test_bfe_2lit_v(ptr addrspace(1) %p) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %id
-  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %ptr = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %id
+  %load = load i32, ptr addrspace(1) %ptr, align 4
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 12345, i32 %load, i32 56789)
-  store i32 %bfe, i32 addrspace(1)* %ptr, align 4
+  store i32 %bfe, ptr addrspace(1) %ptr, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index 3f7d87b98177d..3aaeccf3daedd 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
 
-define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(float addrspace(1)* %ptr, float %data) {
+define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
   ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic
   ; GFX908_GFX11: bb.0 (%ir-block.0):
   ; GFX908_GFX11-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -27,11 +27,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(float addrspace(1
   ; GFX90A_GFX940-NEXT:   [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   ret void
 }
 
-define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(float addrspace(1)* inreg %ptr, float %data) {
+define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
   ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic
   ; GFX908_GFX11: bb.0 (%ir-block.0):
   ; GFX908_GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
@@ -54,11 +54,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(float addrs
   ; GFX90A_GFX940-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* inreg %ptr, float %data)
+  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
   ret void
 }
 
-define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(float addrspace(1)* %ptr, float %data) {
+define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) {
   ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic
   ; GFX908_GFX11: bb.0 (%ir-block.0):
   ; GFX908_GFX11-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -81,11 +81,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(float addrsp
   ; GFX90A_GFX940-NEXT:   [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   ret void
 }
 
-define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(float addrspace(1)* inreg %ptr, float %data) {
+define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
   ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic
   ; GFX908_GFX11: bb.0 (%ir-block.0):
   ; GFX908_GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
@@ -108,11 +108,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(float
   ; GFX90A_GFX940-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float addrspace(1)* inreg %ptr, float %data)
+  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
   ret void
 }
 
-define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(float addrspace(1)* %ptr, float %data) #0 {
+define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) #0 {
   ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw
   ; GFX908_GFX11: bb.0 (%ir-block.0):
   ; GFX908_GFX11-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -135,11 +135,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(float addrspace(1
   ; GFX90A_GFX940-NEXT:   [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
   ret void
 }
 
-define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(float addrspace(1)* inreg %ptr, float %data) #0 {
+define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 {
   ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw
   ; GFX908_GFX11: bb.0 (%ir-block.0):
   ; GFX908_GFX11-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
@@ -162,11 +162,11 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(float addrs
   ; GFX90A_GFX940-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
   ret void
 }
 
 attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
 
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float)
-declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float)
+declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
+declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)

diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index 5de0b4e87b966..c216875786b53 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s
 
-define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(float addrspace(1)* %ptr, float %data) {
+define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -28,11 +28,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(float addrspace(1)*
   ; GFX11-NEXT:   [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX11-NEXT:   $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   ret float %ret
 }
 
-define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_intrinsic(float addrspace(1)* inreg %ptr, float %data) {
+define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
@@ -57,11 +57,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_intrinsic(float addrspa
   ; GFX11-NEXT:   [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX11-NEXT:   $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* inreg %ptr, float %data)
+  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
   ret float %ret
 }
 
-define amdgpu_ps float @global_atomic_fadd_f32_rtn_flat_intrinsic(float addrspace(1)* %ptr, float %data) {
+define amdgpu_ps float @global_atomic_fadd_f32_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -86,11 +86,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_rtn_flat_intrinsic(float addrspac
   ; GFX11-NEXT:   [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX11-NEXT:   $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
+  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   ret float %ret
 }
 
-define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_flat_intrinsic(float addrspace(1)* inreg %ptr, float %data) {
+define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
@@ -115,11 +115,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_flat_intrinsic(float ad
   ; GFX11-NEXT:   [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX11-NEXT:   $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float addrspace(1)* inreg %ptr, float %data)
+  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data)
   ret float %ret
 }
 
-define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(float addrspace(1)* %ptr, float %data) #0 {
+define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) #0 {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -144,11 +144,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(float addrspace(1)*
   ; GFX11-NEXT:   [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
   ; GFX11-NEXT:   $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]]
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
   ret float %ret
 }
 
-define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(float addrspace(1)* inreg %ptr, float %data) #0 {
+define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) #0 {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_atomicrmw
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
@@ -173,11 +173,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(float addrspa
   ; GFX11-NEXT:   [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
   ; GFX11-NEXT:   $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]]
   ; GFX11-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
   ret float %ret
 }
 
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float)
-declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float)
+declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
+declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
 
 attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }

diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
index 5b257350337a5..a499cf7da4f27 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 
-define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(double addrspace(1)* %ptr, double %data) {
+define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -17,11 +17,11 @@ define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(double addrspace(
   ; GFX90A_GFX940-NEXT:   [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret void
 }
 
-define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(double addrspace(1)* %ptr, double %data) {
+define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -40,11 +40,11 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(double addrspace(1
   ; GFX90A_GFX940-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; GFX90A_GFX940-NEXT:   $sgpr1 = COPY [[COPY7]]
   ; GFX90A_GFX940-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
-  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret double %ret
 }
 
-define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(double addrspace(1)* inreg %ptr, double %data) {
+define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
@@ -59,11 +59,11 @@ define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(double addr
   ; GFX90A_GFX940-NEXT:   [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret void
 }
 
-define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(double addrspace(1)* inreg %ptr, double %data) {
+define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
@@ -82,11 +82,11 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(double addrs
   ; GFX90A_GFX940-NEXT:   $sgpr0 = COPY [[COPY5]]
   ; GFX90A_GFX940-NEXT:   $sgpr1 = COPY [[COPY6]]
   ; GFX90A_GFX940-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
-  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret double %ret
 }
 
-define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(double addrspace(1)* %ptr, double %data) {
+define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -101,11 +101,11 @@ define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(double addrs
   ; GFX90A_GFX940-NEXT:   [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret void
 }
 
-define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(double addrspace(1)* %ptr, double %data) {
+define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -124,11 +124,11 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(double addrsp
   ; GFX90A_GFX940-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; GFX90A_GFX940-NEXT:   $sgpr1 = COPY [[COPY7]]
   ; GFX90A_GFX940-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
-  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret double %ret
 }
 
-define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(double addrspace(1)* inreg %ptr, double %data) {
+define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
@@ -143,11 +143,11 @@ define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(double
   ; GFX90A_GFX940-NEXT:   [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret void
 }
 
-define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(double addrspace(1)* inreg %ptr, double %data) {
+define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
@@ -166,11 +166,11 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(double
   ; GFX90A_GFX940-NEXT:   $sgpr0 = COPY [[COPY5]]
   ; GFX90A_GFX940-NEXT:   $sgpr1 = COPY [[COPY6]]
   ; GFX90A_GFX940-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
-  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret double %ret
 }
 
-define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(double addrspace(1)* %ptr, double %data) #0 {
+define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -185,11 +185,11 @@ define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(double addrspace(
   ; GFX90A_GFX940-NEXT:   [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = atomicrmw fadd double addrspace(1)* %ptr, double %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
   ret void
 }
 
-define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(double addrspace(1)* %ptr, double %data) #0 {
+define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) #0 {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -208,11 +208,11 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(double addrspace(1
   ; GFX90A_GFX940-NEXT:   $sgpr0 = COPY [[COPY6]]
   ; GFX90A_GFX940-NEXT:   $sgpr1 = COPY [[COPY7]]
   ; GFX90A_GFX940-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
-  %ret = atomicrmw fadd double addrspace(1)* %ptr, double %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
   ret double %ret
 }
 
-define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(double addrspace(1)* inreg %ptr, double %data) #0 {
+define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_atomicrmw
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
@@ -227,11 +227,11 @@ define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(double addr
   ; GFX90A_GFX940-NEXT:   [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = atomicrmw fadd double addrspace(1)* %ptr, double %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
   ret void
 }
 
-define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(double addrspace(1)* inreg %ptr, double %data) #0 {
+define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, double %data) #0 {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_atomicrmw
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1
@@ -250,11 +250,11 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(double addrs
   ; GFX90A_GFX940-NEXT:   $sgpr0 = COPY [[COPY5]]
   ; GFX90A_GFX940-NEXT:   $sgpr1 = COPY [[COPY6]]
   ; GFX90A_GFX940-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
-  %ret = atomicrmw fadd double addrspace(1)* %ptr, double %data syncscope("wavefront") monotonic
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
   ret double %ret
 }
 
-declare double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)*, double)
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)*, double)
+declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1), double)
+declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1), double)
 
 attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll
index 2118d982d0c26..28410f6fd4b55 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 
-define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
   ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic
   ; GFX908: bb.0 (%ir-block.0):
   ; GFX908-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -26,11 +26,11 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(<2 x half> addr
   ; GFX90A_GFX940-NEXT:   [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
   ret void
 }
 
-define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(<2 x half> addrspace(1)* inreg %ptr, <2 x half> %data) {
+define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
   ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic
   ; GFX908: bb.0 (%ir-block.0):
   ; GFX908-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
@@ -53,11 +53,11 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(<2 x half
   ; GFX90A_GFX940-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
   ret void
 }
 
-define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
   ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic
   ; GFX908: bb.0 (%ir-block.0):
   ; GFX908-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -80,11 +80,11 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(<2 x half>
   ; GFX90A_GFX940-NEXT:   [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
   ret void
 }
 
-define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(<2 x half> addrspace(1)* inreg %ptr, <2 x half> %data) {
+define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
   ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic
   ; GFX908: bb.0 (%ir-block.0):
   ; GFX908-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
@@ -107,9 +107,9 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(<2 x
   ; GFX90A_GFX940-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A_GFX940-NEXT:   GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
-  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
   ret void
 }
 
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>)
-declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>)
+declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
+declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll
index da891c88c0ae7..c8d84309a25c7 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
 
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -15,11 +15,11 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(<2 x half> a
   ; GFX90A_GFX940-NEXT:   [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]]
   ; GFX90A_GFX940-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
   ret <2 x half> %ret
 }
 
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(<2 x half> addrspace(1)* inreg %ptr, <2 x half> %data) {
+define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
@@ -32,11 +32,11 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(<2 x h
   ; GFX90A_GFX940-NEXT:   [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]]
   ; GFX90A_GFX940-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
   ret <2 x half> %ret
 }
 
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
@@ -49,11 +49,11 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(<2 x ha
   ; GFX90A_GFX940-NEXT:   [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]]
   ; GFX90A_GFX940-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
   ret <2 x half> %ret
 }
 
-define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(<2 x half> addrspace(1)* inreg %ptr, <2 x half> %data) {
+define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) {
   ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
   ; GFX90A_GFX940-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0
@@ -66,9 +66,9 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(<
   ; GFX90A_GFX940-NEXT:   [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
   ; GFX90A_GFX940-NEXT:   $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]]
   ; GFX90A_GFX940-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
-  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
   ret <2 x half> %ret
 }
 
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>)
-declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>)
+declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
+declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
index 1f09c304de53f..b42a71f330577 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
@@ -8,7 +8,7 @@
 ; DISASSEMBLY-VI: .long 0xdd348000                                           // {{[0-9]+}}: DD348000
 ; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc                     // {{[0-9]+}}: 00000100
 
-define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -19,7 +19,7 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float ad
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_wbinvl1_vol
 ; GCN-NEXT:    s_endpgm
-  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index 3f46f8d0a466a..68a81c2a182af 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
-define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 {
 ; GFX900-LABEL: global_atomic_fadd_ret_f32:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -142,12 +142,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr)
 ; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
-  store float %result, float addrspace(1)* undef
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst
+  store float %result, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr) #2 {
 ; GFX900-LABEL: global_atomic_fadd_ret_f32_ieee:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -253,12 +253,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
-  store float %result, float addrspace(1)* undef
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
+  store float %result, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #0 {
 ; GFX900-LABEL: global_atomic_fadd_noret_f32:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -342,11 +342,11 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %ptr) #2 {
 ; GFX900-LABEL: global_atomic_fadd_noret_f32_ieee:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -430,11 +430,11 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)*
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(float addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %ptr) #0 {
 ; GFX900-LABEL: global_atomic_fadd_ret_f32_agent:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -540,12 +540,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(float addrspace(1)*
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
-  store float %result, float addrspace(1)* undef
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
+  store float %result, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(float addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %ptr) #0 {
 ; GFX900-LABEL: global_atomic_fadd_ret_f32_system:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -682,12 +682,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(float addrspace(1)*
 ; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst
-  store float %result, float addrspace(1)* undef
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") seq_cst
+  store float %result, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 {
 ; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -739,12 +739,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addr
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX11-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
-  store float %result, float addrspace(1)* undef
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
+  store float %result, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 {
 ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -766,11 +766,11 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_wbinvl1_vol
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(float addrspace(1)* %ptr) {
+define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %ptr) {
 ; GFX900-LABEL: global_atomic_fadd_noret_f32_safe:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -894,11 +894,11 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(float addrspace(1)*
 ; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @infer_as_before_atomic(float* addrspace(4)* %arg) #0 {
+define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 {
 ; GFX900-LABEL: infer_as_before_atomic:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -979,8 +979,8 @@ define amdgpu_kernel void @infer_as_before_atomic(float* addrspace(4)* %arg) #0
 ; GFX11-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %load = load float*, float* addrspace(4)* %arg
-  %v = atomicrmw fadd float* %load, float 1.0 syncscope("agent-one-as") monotonic, align 4
+  %load = load ptr, ptr addrspace(4) %arg
+  %v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll
index 3e3800ef62b75..2bd2888aaa5ef 100644
--- a/llvm/test/CodeGen/AMDGPU/global-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll
@@ -19,13 +19,13 @@
 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2 at rel32@hi+12
 
 ; R600-LABEL: private_test
-define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) {
-  %ptr = getelementptr [4 x float], [4 x float] addrspace(4) * @private1, i32 0, i32 %index
-  %val = load float, float addrspace(4)* %ptr
-  store volatile float %val, float addrspace(1)* %out
-  %ptr2 = getelementptr [4 x float], [4 x float] addrspace(4) * @private2, i32 0, i32 %index
-  %val2 = load float, float addrspace(4)* %ptr2
-  store volatile float %val2, float addrspace(1)* %out
+define amdgpu_kernel void @private_test(i32 %index, ptr addrspace(1) %out) {
+  %ptr = getelementptr [4 x float], ptr addrspace(4) @private1, i32 0, i32 %index
+  %val = load float, ptr addrspace(4) %ptr
+  store volatile float %val, ptr addrspace(1) %out
+  %ptr2 = getelementptr [4 x float], ptr addrspace(4) @private2, i32 0, i32 %index
+  %val2 = load float, ptr addrspace(4) %ptr2
+  store volatile float %val2, ptr addrspace(1) %out
   ret void
 }
 
@@ -34,10 +34,10 @@ define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) {
 ; GCN: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally at gotpcrel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally at gotpcrel32@hi+12
 ; R600-LABEL: available_externally_test
-define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(4)* @available_externally, i32 0, i32 1
-  %val = load i32, i32 addrspace(4)* %ptr
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @available_externally_test(ptr addrspace(1) %out) {
+  %ptr = getelementptr [256 x i32], ptr addrspace(4) @available_externally, i32 0, i32 1
+  %val = load i32, ptr addrspace(4) %ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/global-directive.ll b/llvm/test/CodeGen/AMDGPU/global-directive.ll
index ce89e390eac1f..7c3d61fe8a357 100644
--- a/llvm/test/CodeGen/AMDGPU/global-directive.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-directive.ll
@@ -5,11 +5,11 @@
 
 ; SI:	.globl	foo
 ; SI: {{^}}foo:
-define amdgpu_kernel void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1)* %in
-  %b = load i32, i32 addrspace(1)* %b_ptr
+define amdgpu_kernel void @foo(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
   %result = add i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll b/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll
index ef77513813741..4d9705ec8dc1b 100644
--- a/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll
@@ -7,10 +7,10 @@
 ; SI: buffer_load_ushort
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
-  %a = load i16, i16 addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %a = load i16, ptr addrspace(1) %in
   %ext = zext i16 %a to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -18,138 +18,138 @@ define amdgpu_kernel void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i1
 ; SI: buffer_load_sshort
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
-  %a = load i16, i16 addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %a = load i16, ptr addrspace(1) %in
   %ext = sext i16 %a to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32:
 ; SI: buffer_load_ushort
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <1 x i16>, ptr addrspace(1) %in
   %ext = zext <1 x i16> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32:
 ; SI: buffer_load_sshort
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <1 x i16>, ptr addrspace(1) %in
   %ext = sext <1 x i16> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <2 x i16>, ptr addrspace(1) %in
   %ext = zext <2 x i16> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <2 x i16>, ptr addrspace(1) %in
   %ext = sext <2 x i16> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <4 x i16>, ptr addrspace(1) %in
   %ext = zext <4 x i16> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <4 x i16>, ptr addrspace(1) %in
   %ext = sext <4 x i16> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <8 x i16>, ptr addrspace(1) %in
   %ext = zext <8 x i16> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <8 x i16>, ptr addrspace(1) %in
   %ext = sext <8 x i16> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <16 x i16>, ptr addrspace(1) %in
   %ext = zext <16 x i16> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <16 x i16>, ptr addrspace(1) %in
   %ext = sext <16 x i16> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <32 x i16>, ptr addrspace(1) %in
   %ext = zext <32 x i16> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <32 x i16>, ptr addrspace(1) %in
   %ext = sext <32 x i16> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <64 x i16>, ptr addrspace(1) %in
   %ext = zext <64 x i16> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <64 x i16>, ptr addrspace(1) %in
   %ext = sext <64 x i16> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -157,10 +157,10 @@ define amdgpu_kernel void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace
 ; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]],
 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
-  %a = load i16, i16 addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %a = load i16, ptr addrspace(1) %in
   %ext = zext i16 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -168,135 +168,135 @@ define amdgpu_kernel void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i1
 ; VI: buffer_load_ushort [[LOAD:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0
 ; VI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
 ; VI: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0
-define amdgpu_kernel void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
-  %a = load i16, i16 addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %a = load i16, ptr addrspace(1) %in
   %ext = sext i16 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <1 x i16>, ptr addrspace(1) %in
   %ext = zext <1 x i16> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <1 x i16>, ptr addrspace(1) %in
   %ext = sext <1 x i16> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <2 x i16>, ptr addrspace(1) %in
   %ext = zext <2 x i16> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <2 x i16>, ptr addrspace(1) %in
   %ext = sext <2 x i16> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <4 x i16>, ptr addrspace(1) %in
   %ext = zext <4 x i16> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <4 x i16>, ptr addrspace(1) %in
   %ext = sext <4 x i16> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <8 x i16>, ptr addrspace(1) %in
   %ext = zext <8 x i16> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <8 x i16>, ptr addrspace(1) %in
   %ext = sext <8 x i16> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <16 x i16>, ptr addrspace(1) %in
   %ext = zext <16 x i16> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <16 x i16>, ptr addrspace(1) %in
   %ext = sext <16 x i16> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <32 x i16>, ptr addrspace(1) %in
   %ext = zext <32 x i16> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <32 x i16>, ptr addrspace(1) %in
   %ext = sext <32 x i16> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+define amdgpu_kernel void @zextload_global_v64i16_to_v64i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <64 x i16>, ptr addrspace(1) %in
   %ext = zext <64 x i16> %load to <64 x i64>
-  store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+  store <64 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64:
 ; SI: s_endpgm
-define amdgpu_kernel void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
-  %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+define amdgpu_kernel void @sextload_global_v64i16_to_v64i64(ptr addrspace(1) %out, ptr addrspace(1) nocapture %in) nounwind {
+  %load = load <64 x i16>, ptr addrspace(1) %in
   %ext = sext <64 x i16> %load to <64 x i64>
-  store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+  store <64 x i64> %ext, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
index d8ed6f6d97379..8dd2d8a030370 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
@@ -10,7 +10,7 @@
 ; Check that we are changing SADDR form of a load to VADDR and do not have to use
 ; readfirstlane instructions to move address from VGPRs into SGPRs.
 
-define amdgpu_kernel void @test_move_load_address_to_vgpr(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg) {
 ; GCN-LABEL: test_move_load_address_to_vgpr:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -35,8 +35,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(i32 addrspace(1)* noca
 ; GCN-NEXT:  ; %bb.2: ; %bb2
 ; GCN-NEXT:    s_endpgm
 bb:
-  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 0
-  %i2 = load volatile i32, i32 addrspace(1)* %i1, align 4
+  %i2 = load volatile i32, ptr addrspace(1) %arg, align 4
   br label %bb3
 
 bb2:                                              ; preds = %bb3
@@ -45,14 +44,14 @@ bb2:                                              ; preds = %bb3
 bb3:                                              ; preds = %bb3, %bb
   %i = phi i32 [ %i2, %bb ], [ %i8, %bb3 ]
   %i4 = zext i32 %i to i64
-  %i5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %i4
-  %i6 = load volatile i32, i32 addrspace(1)* %i5, align 4
+  %i5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %i4
+  %i6 = load volatile i32, ptr addrspace(1) %i5, align 4
   %i8 = add nuw nsw i32 %i, 1
   %i9 = icmp eq i32 %i8, 256
   br i1 %i9, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(i16 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1) nocapture %arg) {
 ; GCN-LABEL: test_move_load_address_to_vgpr_d16_hi:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -75,8 +74,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(i16 addrspace(1
 ; GCN-NEXT:  ; %bb.2: ; %bb2
 ; GCN-NEXT:    s_endpgm
 bb:
-  %i1 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 0
-  %load.pre = load volatile i16, i16 addrspace(1)* %i1, align 4
+  %load.pre = load volatile i16, ptr addrspace(1) %arg, align 4
   %i2 = zext i16 %load.pre to i32
   br label %bb3
 
@@ -86,8 +84,8 @@ bb2:                                              ; preds = %bb3
 bb3:                                              ; preds = %bb3, %bb
   %i = phi i32 [ %i2, %bb ], [ %i8, %bb3 ]
   %i4 = zext i32 %i to i64
-  %i5 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %i4
-  %i6 = load volatile i16, i16 addrspace(1)* %i5, align 4
+  %i5 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %i4
+  %i6 = load volatile i16, ptr addrspace(1) %i5, align 4
   %insertelt = insertelement <2 x i16> undef, i16 %i6, i32 1
   %i8 =  bitcast <2 x i16> %insertelt to i32
   %i9 = icmp eq i32 %i8, 256

diff  --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
index 040c8dfff2d5a..2ee53ed988ffd 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
@@ -9,7 +9,7 @@
 ; atomicrmw max
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_max_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_max_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
@@ -91,14 +91,13 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw max i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
@@ -180,15 +179,14 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw max i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_max_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_max_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
@@ -263,13 +261,12 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw max i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
@@ -344,14 +341,13 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw max i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_max_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
@@ -445,14 +441,13 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw max i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
@@ -546,15 +541,14 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(i8 addrspace(1)* i
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw max i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_max_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_max_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
@@ -638,13 +632,12 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw max i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
@@ -728,10 +721,9 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw max i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst
   ret void
 }
 
@@ -739,7 +731,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; atomicrmw min
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_min_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_min_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
@@ -821,14 +813,13 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw min i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
@@ -910,15 +901,14 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw min i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_min_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_min_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
@@ -993,13 +983,12 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw min i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
@@ -1074,14 +1063,13 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw min i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_min_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
@@ -1175,14 +1163,13 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw min i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
@@ -1276,15 +1263,14 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(i8 addrspace(1)* i
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw min i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_min_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_min_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
@@ -1368,13 +1354,12 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw min i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
@@ -1458,10 +1443,9 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw min i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst
   ret void
 }
 
@@ -1469,7 +1453,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; atomicrmw umax
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_umax_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umax_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
@@ -1551,14 +1535,13 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw umax i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
@@ -1640,15 +1623,14 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw umax i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_umax_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umax_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
@@ -1723,13 +1705,12 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw umax i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
@@ -1804,14 +1785,13 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw umax i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umax_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
@@ -1905,14 +1885,13 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw umax i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
@@ -2006,15 +1985,14 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(i8 addrspace(1)*
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw umax i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_umax_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umax_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
@@ -2098,13 +2076,12 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw umax i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
@@ -2188,10 +2165,9 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw umax i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
   ret void
 }
 
@@ -2199,7 +2175,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; atomicrmw umin
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_umin_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umin_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
@@ -2281,14 +2257,13 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw umin i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
@@ -2370,15 +2345,14 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw umin i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_umin_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umin_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
@@ -2453,13 +2427,12 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw umin i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
@@ -2534,14 +2507,13 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw umin i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umin_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
@@ -2635,14 +2607,13 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw umin i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
@@ -2736,15 +2707,14 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(i8 addrspace(1)*
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw umin i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_umin_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umin_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
@@ -2828,13 +2798,12 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw umin i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
@@ -2918,10 +2887,9 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw umin i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll
index c4fc680e7cbe1..0147084a6996f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll
@@ -7,57 +7,53 @@
 ; amdgcn global atomic fadd
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps void @global_fadd_saddr_f32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, float %data) {
+define amdgpu_ps void @global_fadd_saddr_f32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, float %data) {
 ; GCN-LABEL: global_fadd_saddr_f32_nortn:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_add_f32 v0, v1, s[2:3]
 ; GCN-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
-  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep0, float %data)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) %gep0, float %data)
   ret void
 }
 
-define amdgpu_ps void @global_fadd_saddr_f32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, float %data) {
+define amdgpu_ps void @global_fadd_saddr_f32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, float %data) {
 ; GCN-LABEL: global_fadd_saddr_f32_nortn_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_add_f32 v0, v1, s[2:3] offset:-128
 ; GCN-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
-  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep1, float %data)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) %gep1, float %data)
   ret void
 }
 
-define amdgpu_ps void @global_fadd_saddr_v2f16_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x half> %data) {
+define amdgpu_ps void @global_fadd_saddr_v2f16_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x half> %data) {
 ; GCN-LABEL: global_fadd_saddr_v2f16_nortn:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_pk_add_f16 v0, v1, s[2:3]
 ; GCN-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)*
-  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep0, <2 x half> %data)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) %gep0, <2 x half> %data)
   ret void
 }
 
-define amdgpu_ps void @global_fadd_saddr_v2f16_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x half> %data) {
+define amdgpu_ps void @global_fadd_saddr_v2f16_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x half> %data) {
 ; GCN-LABEL: global_fadd_saddr_v2f16_nortn_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_pk_add_f16 v0, v1, s[2:3] offset:-128
 ; GCN-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)*
-  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep1, <2 x half> %data)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) %gep1, <2 x half> %data)
   ret void
 }
 
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* nocapture, float) #0
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
+declare float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) nocapture, float) #0
+declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) nocapture, <2 x half>) #0
 
 attributes #0 = { argmemonly nounwind willreturn }

diff  --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
index 1a4bde73ae4c7..c1bbfa06e9d2b 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
@@ -5,7 +5,7 @@
 
 ; Test using saddr addressing mode of global_* flat atomic instructions.
 
-define amdgpu_ps void @global_xchg_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_xchg_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_xchg_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -34,14 +34,13 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data seq_cst
   ret void
 }
 
 ; Maximum positive offset on gfx10
-define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_2047(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_xchg_saddr_i32_nortn_offset_2047:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -70,15 +69,14 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_2047(i8 addrspace(1)*
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2047
+  %unused = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data seq_cst
   ret void
 }
 
 ; Maximum negative offset on gfx10
-define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_neg2048(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -107,14 +105,13 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_neg2048(i8 addrspace(1
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048
+  %unused = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps float @global_xchg_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_xchg_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_xchg_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -143,14 +140,13 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_xchg_saddr_i32_rtn_2048:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -183,15 +179,14 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2048
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_xchg_saddr_i32_rtn_neg2048:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -220,10 +215,9 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -232,7 +226,7 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(i8 addrspace(1)* inreg
 ; Uniformity edge cases
 ; --------------------------------------------------------------------------------
 
- at ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef
+ at ptr.in.lds = internal addrspace(3) global ptr addrspace(1) undef
 
 ; Base pointer is uniform, but also in VGPRs
 define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i32 %data) {
@@ -279,11 +273,10 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
-  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+  %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -333,12 +326,11 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
-  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+  %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 42
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -388,11 +380,10 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
-  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+  %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data seq_cst
   ret void
 }
 
@@ -441,12 +432,11 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i3
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
-  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+  %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 42
+  %unused = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data seq_cst
   ret void
 }
 
@@ -458,7 +448,7 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i3
 ; atomicrmw xchg
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_xchg_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -487,14 +477,13 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw xchg i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_xchg_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -523,15 +512,14 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(i8 addrspace(1)*
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw xchg i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_xchg_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_xchg_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_xchg_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -560,13 +548,12 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw xchg i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw xchg ptr addrspace(1) %gep0, i64 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_xchg_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -595,10 +582,9 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw xchg i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw xchg ptr addrspace(1) %gep1, i64 %data seq_cst
   ret void
 }
 
@@ -606,7 +592,7 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; atomicrmw add
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_add_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_add_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_add_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -635,14 +621,13 @@ define amdgpu_ps float @global_add_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw add i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw add ptr addrspace(1) %gep0, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_add_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -671,15 +656,14 @@ define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw add i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw add ptr addrspace(1) %gep1, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_add_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_add_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_add_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -708,13 +692,12 @@ define amdgpu_ps void @global_add_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw add i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw add ptr addrspace(1) %gep0, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_add_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_add_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_add_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -743,14 +726,13 @@ define amdgpu_ps void @global_add_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw add i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw add ptr addrspace(1) %gep1, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_add_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -779,14 +761,13 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw add i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw add ptr addrspace(1) %gep0, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_add_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -815,15 +796,14 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(i8 addrspace(1)* i
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw add i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw add ptr addrspace(1) %gep1, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_add_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_add_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_add_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -852,13 +832,12 @@ define amdgpu_ps void @global_add_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw add i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw add ptr addrspace(1) %gep0, i64 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_add_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -887,10 +866,9 @@ define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw add i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw add ptr addrspace(1) %gep1, i64 %data seq_cst
   ret void
 }
 
@@ -898,7 +876,7 @@ define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; atomicrmw sub
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_sub_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_sub_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_sub_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -927,14 +905,13 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw sub i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw sub ptr addrspace(1) %gep0, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_sub_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -963,15 +940,14 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw sub i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw sub ptr addrspace(1) %gep1, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_sub_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_sub_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_sub_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1000,13 +976,12 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw sub i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw sub ptr addrspace(1) %gep0, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_sub_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1035,14 +1010,13 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw sub i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw sub ptr addrspace(1) %gep1, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_sub_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1071,14 +1045,13 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw sub i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw sub ptr addrspace(1) %gep0, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_sub_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1107,15 +1080,14 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(i8 addrspace(1)* i
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw sub i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw sub ptr addrspace(1) %gep1, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_sub_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_sub_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_sub_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1144,13 +1116,12 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw sub i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw sub ptr addrspace(1) %gep0, i64 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_sub_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1179,10 +1150,9 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw sub i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw sub ptr addrspace(1) %gep1, i64 %data seq_cst
   ret void
 }
 
@@ -1190,7 +1160,7 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; atomicrmw and
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_and_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_and_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_and_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1219,14 +1189,13 @@ define amdgpu_ps float @global_and_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw and i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw and ptr addrspace(1) %gep0, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_and_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1255,15 +1224,14 @@ define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw and i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw and ptr addrspace(1) %gep1, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_and_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_and_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_and_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1292,13 +1260,12 @@ define amdgpu_ps void @global_and_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw and i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw and ptr addrspace(1) %gep0, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_and_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1327,14 +1294,13 @@ define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw and i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw and ptr addrspace(1) %gep1, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_and_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1363,14 +1329,13 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw and i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw and ptr addrspace(1) %gep0, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_and_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1399,15 +1364,14 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(i8 addrspace(1)* i
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw and i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw and ptr addrspace(1) %gep1, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_and_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_and_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_and_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1436,13 +1400,12 @@ define amdgpu_ps void @global_and_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw and i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw and ptr addrspace(1) %gep0, i64 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_and_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1471,10 +1434,9 @@ define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw and i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw and ptr addrspace(1) %gep1, i64 %data seq_cst
   ret void
 }
 
@@ -1482,7 +1444,7 @@ define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; atomicrmw or
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_or_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_or_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_or_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1511,14 +1473,13 @@ define amdgpu_ps float @global_or_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw or i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw or ptr addrspace(1) %gep0, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_or_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1547,15 +1508,14 @@ define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw or i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw or ptr addrspace(1) %gep1, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_or_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_or_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_or_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1584,13 +1544,12 @@ define amdgpu_ps void @global_or_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw or i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw or ptr addrspace(1) %gep0, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_or_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1619,14 +1578,13 @@ define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw or i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw or ptr addrspace(1) %gep1, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_or_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1655,14 +1613,13 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw or i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw or ptr addrspace(1) %gep0, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_or_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1691,15 +1648,14 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(i8 addrspace(1)* in
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw or i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw or ptr addrspace(1) %gep1, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_or_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_or_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_or_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1728,13 +1684,12 @@ define amdgpu_ps void @global_or_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw or i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw or ptr addrspace(1) %gep0, i64 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_or_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1763,10 +1718,9 @@ define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw or i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw or ptr addrspace(1) %gep1, i64 %data seq_cst
   ret void
 }
 
@@ -1774,7 +1728,7 @@ define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %
 ; atomicrmw xor
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_xor_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_xor_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_xor_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1803,14 +1757,13 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw xor i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw xor ptr addrspace(1) %gep0, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_xor_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1839,15 +1792,14 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw xor i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw xor ptr addrspace(1) %gep1, i32 %data seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_xor_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_xor_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_xor_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1876,13 +1828,12 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw xor i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw xor ptr addrspace(1) %gep0, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_xor_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1911,14 +1862,13 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw xor i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw xor ptr addrspace(1) %gep1, i32 %data seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_xor_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1947,14 +1897,13 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw xor i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw xor ptr addrspace(1) %gep0, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_xor_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1983,15 +1932,14 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(i8 addrspace(1)* i
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw xor i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw xor ptr addrspace(1) %gep1, i64 %data seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_xor_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_xor_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_xor_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2020,13 +1968,12 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw xor i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw xor ptr addrspace(1) %gep0, i64 %data seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_xor_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2055,10 +2002,9 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw xor i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw xor ptr addrspace(1) %gep1, i64 %data seq_cst
   ret void
 }
 
@@ -2066,7 +2012,7 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; atomicrmw max
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_max_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_max_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2092,14 +2038,13 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw max i32 addrspace(1)* %cast.gep0, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2125,15 +2070,14 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw max i32 addrspace(1)* %cast.gep1, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_max_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_max_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2158,13 +2102,12 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw max i32 addrspace(1)* %cast.gep0, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2189,14 +2132,13 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw max i32 addrspace(1)* %cast.gep1, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_max_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2222,14 +2164,13 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw max i64 addrspace(1)* %cast.gep0, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2255,15 +2196,14 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(i8 addrspace(1)* i
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw max i64 addrspace(1)* %cast.gep1, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_max_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_max_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2288,13 +2228,12 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw max i64 addrspace(1)* %cast.gep0, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2319,10 +2258,9 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw max i64 addrspace(1)* %cast.gep1, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
   ret void
 }
 
@@ -2330,7 +2268,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; atomicrmw min
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_min_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_min_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2356,14 +2294,13 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw min i32 addrspace(1)* %cast.gep0, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2389,15 +2326,14 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw min i32 addrspace(1)* %cast.gep1, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_min_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_min_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2422,13 +2358,12 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw min i32 addrspace(1)* %cast.gep0, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2453,14 +2388,13 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw min i32 addrspace(1)* %cast.gep1, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_min_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2486,14 +2420,13 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw min i64 addrspace(1)* %cast.gep0, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2519,15 +2452,14 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(i8 addrspace(1)* i
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw min i64 addrspace(1)* %cast.gep1, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_min_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_min_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2552,13 +2484,12 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw min i64 addrspace(1)* %cast.gep0, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2583,10 +2514,9 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw min i64 addrspace(1)* %cast.gep1, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
   ret void
 }
 
@@ -2594,7 +2524,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; atomicrmw umax
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_umax_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umax_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2620,14 +2550,13 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw umax i32 addrspace(1)* %cast.gep0, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2653,15 +2582,14 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw umax i32 addrspace(1)* %cast.gep1, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_umax_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umax_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2686,13 +2614,12 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw umax i32 addrspace(1)* %cast.gep0, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2717,14 +2644,13 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw umax i32 addrspace(1)* %cast.gep1, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umax_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2750,14 +2676,13 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw umax i64 addrspace(1)* %cast.gep0, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2783,15 +2708,14 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(i8 addrspace(1)*
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw umax i64 addrspace(1)* %cast.gep1, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_umax_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umax_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2816,13 +2740,12 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw umax i64 addrspace(1)* %cast.gep0, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2847,10 +2770,9 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw umax i64 addrspace(1)* %cast.gep1, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
   ret void
 }
 
@@ -2858,7 +2780,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; atomicrmw umin
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_umin_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umin_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2884,14 +2806,13 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = atomicrmw umin i32 addrspace(1)* %cast.gep0, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2917,15 +2838,14 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = atomicrmw umin i32 addrspace(1)* %cast.gep1, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_umin_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umin_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2950,13 +2870,12 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = atomicrmw umin i32 addrspace(1)* %cast.gep0, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2981,14 +2900,13 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = atomicrmw umin i32 addrspace(1)* %cast.gep1, i32 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umin_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3014,14 +2932,13 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = atomicrmw umin i64 addrspace(1)* %cast.gep0, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3047,15 +2964,14 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(i8 addrspace(1)*
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = atomicrmw umin i64 addrspace(1)* %cast.gep1, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_umin_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umin_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3080,13 +2996,12 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = atomicrmw umin i64 addrspace(1)* %cast.gep0, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3111,10 +3026,9 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = atomicrmw umin i64 addrspace(1)* %cast.gep1, i64 %data syncscope("workgroup") seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
   ret void
 }
 
@@ -3122,7 +3036,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; cmpxchg
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
+define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
 ; GFX9-LABEL: global_cmpxchg_saddr_i32_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
@@ -3154,15 +3068,14 @@ define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn(i8 addrspace(1)* inreg %sba
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %cmpxchg = cmpxchg i32 addrspace(1)* %cast.gep0, i32 %cmp, i32 %data seq_cst seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %cmpxchg = cmpxchg ptr addrspace(1) %gep0, i32 %cmp, i32 %data seq_cst seq_cst
   %rtn = extractvalue { i32, i1 } %cmpxchg, 0
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
+define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
 ; GFX9-LABEL: global_cmpxchg_saddr_i32_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
@@ -3194,16 +3107,15 @@ define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn_neg128(i8 addrspace(1)* inr
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %cmpxchg = cmpxchg i32 addrspace(1)* %cast.gep1, i32 %cmp, i32 %data seq_cst seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %cmpxchg = cmpxchg ptr addrspace(1) %gep1, i32 %cmp, i32 %data seq_cst seq_cst
   %rtn = extractvalue { i32, i1 } %cmpxchg, 0
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
+define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
 ; GFX9-LABEL: global_cmpxchg_saddr_i32_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
@@ -3235,13 +3147,12 @@ define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = cmpxchg i32 addrspace(1)* %cast.gep0, i32 %cmp, i32 %data seq_cst seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = cmpxchg ptr addrspace(1) %gep0, i32 %cmp, i32 %data seq_cst seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
+define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
 ; GFX9-LABEL: global_cmpxchg_saddr_i32_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
@@ -3273,14 +3184,13 @@ define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn_neg128(i8 addrspace(1)* in
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = cmpxchg i32 addrspace(1)* %cast.gep1, i32 %cmp, i32 %data seq_cst seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = cmpxchg ptr addrspace(1) %gep1, i32 %cmp, i32 %data seq_cst seq_cst
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
+define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
 ; GFX9-LABEL: global_cmpxchg_saddr_i64_rtn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v2
@@ -3315,15 +3225,14 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(i8 addrspace(1)* inre
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %cmpxchg = cmpxchg i64 addrspace(1)* %cast.gep0, i64 %cmp, i64 %data seq_cst seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %cmpxchg = cmpxchg ptr addrspace(1) %gep0, i64 %cmp, i64 %data seq_cst seq_cst
   %rtn = extractvalue { i64, i1 } %cmpxchg, 0
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
+define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
 ; GFX9-LABEL: global_cmpxchg_saddr_i64_rtn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v2
@@ -3358,16 +3267,15 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(i8 addrspace(1
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %cmpxchg = cmpxchg i64 addrspace(1)* %cast.gep1, i64 %cmp, i64 %data seq_cst seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %cmpxchg = cmpxchg ptr addrspace(1) %gep1, i64 %cmp, i64 %data seq_cst seq_cst
   %rtn = extractvalue { i64, i1 } %cmpxchg, 0
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
+define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
 ; GFX9-LABEL: global_cmpxchg_saddr_i64_nortn:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v2
@@ -3402,13 +3310,12 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = cmpxchg i64 addrspace(1)* %cast.gep0, i64 %cmp, i64 %data seq_cst seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = cmpxchg ptr addrspace(1) %gep0, i64 %cmp, i64 %data seq_cst seq_cst
   ret void
 }
 
-define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
+define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
 ; GFX9-LABEL: global_cmpxchg_saddr_i64_nortn_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v2
@@ -3443,10 +3350,9 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(i8 addrspace(1)* in
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = cmpxchg i64 addrspace(1)* %cast.gep1, i64 %cmp, i64 %data seq_cst seq_cst
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = cmpxchg ptr addrspace(1) %gep1, i64 %cmp, i64 %data seq_cst seq_cst
   ret void
 }
 
@@ -3454,10 +3360,10 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(i8 addrspace(1)* in
 ; amdgcn atomic inc
 ; --------------------------------------------------------------------------------
 
-declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
-declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
 
-define amdgpu_ps float @global_inc_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_inc_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_inc_saddr_i32_rtn:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_inc v0, v0, v1, s[2:3] glc
@@ -3470,14 +3376,13 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep0, i32 %data, i32 0, i32 0, i1 false)
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_inc_saddr_i32_rtn_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_inc v0, v0, v1, s[2:3] offset:-128 glc
@@ -3490,15 +3395,14 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep1, i32 %data, i32 0, i32 0, i1 false)
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_inc_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_inc_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_inc_saddr_i32_nortn:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_inc v0, v1, s[2:3]
@@ -3510,13 +3414,12 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep0, i32 %data, i32 0, i32 0, i1 false)
   ret void
 }
 
-define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_inc_saddr_i32_nortn_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_inc v0, v1, s[2:3] offset:-128
@@ -3528,14 +3431,13 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep1, i32 %data, i32 0, i32 0, i1 false)
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GCN-LABEL: global_inc_saddr_i64_rtn:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_inc_x2 v[0:1], v0, v[1:2], s[2:3] glc
@@ -3548,14 +3450,13 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep0, i64 %data, i32 0, i32 0, i1 false)
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GCN-LABEL: global_inc_saddr_i64_rtn_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_inc_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
@@ -3568,15 +3469,14 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(i8 addrspace(1)* i
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep1, i64 %data, i32 0, i32 0, i1 false)
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_inc_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_inc_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GCN-LABEL: global_inc_saddr_i64_nortn:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_inc_x2 v0, v[1:2], s[2:3]
@@ -3588,13 +3488,12 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep0, i64 %data, i32 0, i32 0, i1 false)
   ret void
 }
 
-define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GCN-LABEL: global_inc_saddr_i64_nortn_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_inc_x2 v0, v[1:2], s[2:3] offset:-128
@@ -3606,10 +3505,9 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep1, i64 %data, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -3617,10 +3515,10 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; amdgcn atomic dec
 ; --------------------------------------------------------------------------------
 
-declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
-declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
 
-define amdgpu_ps float @global_dec_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_dec_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_dec_saddr_i32_rtn:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_dec v0, v0, v1, s[2:3] glc
@@ -3633,14 +3531,13 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep0, i32 %data, i32 0, i32 0, i1 false)
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_dec_saddr_i32_rtn_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_dec v0, v0, v1, s[2:3] offset:-128 glc
@@ -3653,15 +3550,14 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep1, i32 %data, i32 0, i32 0, i1 false)
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_dec_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_dec_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_dec_saddr_i32_nortn:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_dec v0, v1, s[2:3]
@@ -3673,13 +3569,12 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep0, i32 %data, i32 0, i32 0, i1 false)
   ret void
 }
 
-define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_dec_saddr_i32_nortn_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_dec v0, v1, s[2:3] offset:-128
@@ -3691,14 +3586,13 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep1, i32 %data, i32 0, i32 0, i1 false)
   ret void
 }
 
-define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GCN-LABEL: global_dec_saddr_i64_rtn:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_dec_x2 v[0:1], v0, v[1:2], s[2:3] glc
@@ -3711,14 +3605,13 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep0, i64 %data, i32 0, i32 0, i1 false)
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GCN-LABEL: global_dec_saddr_i64_rtn_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_dec_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
@@ -3731,15 +3624,14 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(i8 addrspace(1)* i
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep1, i64 %data, i32 0, i32 0, i1 false)
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
 
-define amdgpu_ps void @global_dec_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_dec_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GCN-LABEL: global_dec_saddr_i64_nortn:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_dec_x2 v0, v[1:2], s[2:3]
@@ -3751,13 +3643,12 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep0, i64 %data, i32 0, i32 0, i1 false)
   ret void
 }
 
-define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GCN-LABEL: global_dec_saddr_i64_nortn_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_dec_x2 v0, v[1:2], s[2:3] offset:-128
@@ -3769,10 +3660,9 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep1, i64 %data, i32 0, i32 0, i1 false)
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 56bf823854b0a..c59349d718a6e 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -10,7 +10,7 @@
 ; --------------------------------------------------------------------------------
 
 ; SGPR base only
-define amdgpu_ps float @global_load_saddr_i8_offset_0(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_0(ptr addrspace(1) inreg %sbase) {
 ; GCN-LABEL: global_load_saddr_i8_offset_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
@@ -24,14 +24,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %load = load i8, i8 addrspace(1)* %sbase
+  %load = load i8, ptr addrspace(1) %sbase
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; SGPR base with maximum gfx9 immediate offset
-define amdgpu_ps float @global_load_saddr_i8_offset_4095(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_4095(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_4095:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -52,15 +52,15 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4095(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; SGPR base with maximum gfx9 immediate offset + 1
-define amdgpu_ps float @global_load_saddr_i8_offset_4096(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_4096(ptr addrspace(1) inreg %sbase) {
 ; GCN-LABEL: global_load_saddr_i8_offset_4096:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x1000
@@ -74,15 +74,15 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4096(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4096
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4096
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; SGPR base with maximum gfx9 immediate offset + 2
-define amdgpu_ps float @global_load_saddr_i8_offset_4097(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_4097(ptr addrspace(1) inreg %sbase) {
 ; GCN-LABEL: global_load_saddr_i8_offset_4097:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x1000
@@ -96,15 +96,15 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4097(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4097
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4097
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; SGPR base with maximum negative gfx9 immediate offset
-define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4096:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -126,15 +126,15 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inr
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4096
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4096
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4096
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; SGPR base with maximum negative gfx9 immediate offset -1
-define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4097:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s2, 0xffffefff
@@ -160,15 +160,15 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(i8 addrspace(1)* inr
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4097
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4097
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; SGPR base with maximum negative gfx9 immediate offset -2
-define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4098:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s2, 0xffffeffe
@@ -194,15 +194,15 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inr
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4098
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4098
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; SGPR base with maximum gfx10 immediate offset
-define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_2048(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_2048:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -223,15 +223,15 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2048
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2048
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; SGPR base with maximum gfx10 immediate offset + 1
-define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_2049(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_2049:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -252,15 +252,15 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2049
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2049
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2049
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; SGPR base with maximum gfx10 immediate offset + 2
-define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_2050(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_2050:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -281,15 +281,15 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2050
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2050
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2050
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; SGPR base with maximum negative gfx10 immediate offset
-define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(ptr addrspace(1) inreg %sbase) {
 ; GCN-LABEL: global_load_saddr_i8_offset_neg2048:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
@@ -303,15 +303,15 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inr
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2048
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2048
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; SGPR base with maximum negative gfx10 immediate offset - 1
-define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg2049:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -333,15 +333,15 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inr
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2049
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2049
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2049
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; SGPR base with maximum negative gfx10 immediate offset - 2
-define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg2050:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -363,14 +363,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inr
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2050
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2050
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2050
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_4294967295:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xfffff000
@@ -391,14 +391,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(i8 addrspace(1)*
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967295
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967295
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_4294967296:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -424,14 +424,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(i8 addrspace(1)*
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967296
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967296
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_4294967297:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -457,14 +457,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(i8 addrspace(1)*
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967297
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967297
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_4294971391:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s2, 0xfff
@@ -490,14 +490,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(i8 addrspace(1)*
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971391
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971391
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_4294971392:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s2, 0x1000
@@ -523,14 +523,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(i8 addrspace(1)*
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971392
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971392
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967295:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -557,14 +557,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(i8 addrspace(1
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967295
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967295
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -590,14 +590,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(i8 addrspace(1
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967296
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967296
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(i8 addrspace(1)* inreg %sbase) {
+define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -623,8 +623,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(i8 addrspace(1
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967297
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967297
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
@@ -635,7 +635,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(i8 addrspace(1
 ; --------------------------------------------------------------------------------
 
 ; Basic pattern, no immediate offset.
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
@@ -648,15 +648,15 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Maximum positive offset on gfx9
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
@@ -679,16 +679,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(i8 addrspace(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4095
+  %load = load i8, ptr addrspace(1) %gep1
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Maximum positive offset on gfx9 + 1
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -722,16 +722,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(i8 addrspace(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4096
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4096
+  %load = load i8, ptr addrspace(1) %gep1
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Maximum negative offset on gfx9
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-4096
@@ -754,16 +754,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(i8 addrspa
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4096
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -4096
+  %load = load i8, ptr addrspace(1) %gep1
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Maximum negative offset on gfx9 - 1
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -797,16 +797,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(i8 addrspa
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4097
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -4097
+  %load = load i8, ptr addrspace(1) %gep1
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Maximum positive offset on gfx10
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2047
@@ -819,16 +819,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2047
+  %load = load i8, ptr addrspace(1) %gep1
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Maximum positive offset on gfx10 + 1
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2048
@@ -851,16 +851,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(i8 addrspace(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2048
+  %load = load i8, ptr addrspace(1) %gep1
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Maximum negative offset on gfx10
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2048
@@ -873,16 +873,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspa
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048
+  %load = load i8, ptr addrspace(1) %gep1
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Maximum negative offset on gfx10 - 1
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2049
@@ -905,16 +905,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(i8 addrspa
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2049
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2049
+  %load = load i8, ptr addrspace(1) %gep1
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Maximum positive offset on gfx9, and immediate needs to be moved lower.
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
@@ -937,16 +937,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(i8
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 %zext.offset
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %zext.offset
+  %load = load i8, ptr addrspace(1) %gep1
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; pointer addressing done in integers
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
@@ -959,17 +959,17 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
+  %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
   %add = add i64 %sbase.as.int, %zext.offset
-  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %dirty.gep
+  %dirty.gep = inttoptr i64 %add to ptr addrspace(1)
+  %load = load i8, ptr addrspace(1) %dirty.gep
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; zext forced to LHS of addressing expression
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
@@ -982,17 +982,17 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(i8 a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
+  %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
   %add = add i64 %zext.offset, %sbase.as.int
-  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %dirty.gep
+  %dirty.gep = inttoptr i64 %add to ptr addrspace(1)
+  %load = load i8, ptr addrspace(1) %dirty.gep
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; zext forced to LHS of addressing expression, with immediate offset
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
@@ -1005,18 +1005,18 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
+  %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
   %add = add i64 %zext.offset, %sbase.as.int
   %add.immoffset = add i64 %add, 128
-  %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %dirty.gep
+  %dirty.gep = inttoptr i64 %add.immoffset to ptr addrspace(1)
+  %load = load i8, ptr addrspace(1) %dirty.gep
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; zext forced to LHS of addressing expression, with immediate offset in non-canonical position
-define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
@@ -1029,11 +1029,11 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
+  %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
   %add.immoffset = add i64 %sbase.as.int, 128
   %add = add i64 %zext.offset, %add.immoffset
-  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %dirty.gep
+  %dirty.gep = inttoptr i64 %add to ptr addrspace(1)
+  %load = load i8, ptr addrspace(1) %dirty.gep
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
@@ -1043,7 +1043,7 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_
 ; Uniformity edge cases
 ; --------------------------------------------------------------------------------
 
- at ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef
+ at ptr.in.lds = internal addrspace(3) global ptr addrspace(1) undef
 
 ; Base pointer is uniform, but also in VGPRs
 define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
@@ -1080,10 +1080,10 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+  %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
@@ -1124,18 +1124,18 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
 ; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:42
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+  %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 42
+  %load = load i8, ptr addrspace(1) %gep1
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Both 64-bit base and 32-bit offset are scalar
-define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
 ; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
@@ -1150,15 +1150,15 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(i8 addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Both 64-bit base and 32-bit offset are scalar, with immediate offset.
-define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
 ; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
@@ -1173,16 +1173,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(i8 ad
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -24
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -24
+  %load = load i8, ptr addrspace(1) %gep1
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Both components uniform, zext forced to LHS of addressing expression
-define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
 ; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
@@ -1197,17 +1197,17 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(i8 a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
-  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
+  %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
   %add = add i64 %zext.offset, %sbase.as.int
-  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %dirty.gep
+  %dirty.gep = inttoptr i64 %add to ptr addrspace(1)
+  %load = load i8, ptr addrspace(1) %dirty.gep
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; Both components uniform, zext forced to LHS of addressing expression, with immediate offset
-define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
+define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
 ; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
@@ -1222,18 +1222,18 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
-  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
+  %sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
   %add = add i64 %zext.offset, %sbase.as.int
   %add.immoffset = add i64 %add, 128
-  %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %dirty.gep
+  %dirty.gep = inttoptr i64 %add.immoffset to ptr addrspace(1)
+  %load = load i8, ptr addrspace(1) %dirty.gep
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; divergent 64-bit base, 32-bit scalar offset.
-define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
+define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i32 inreg %soffset) {
 ; GFX9-LABEL: global_load_i8_vgpr64_sgpr32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
@@ -1258,15 +1258,15 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(i8 addrspace(1)* %vbase, i3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
+  %load = load i8, ptr addrspace(1) %gep0
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
 ; divergent 64-bit base, 32-bit scalar offset, with imm offset
-define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
+define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1) %vbase, i32 inreg %soffset) {
 ; GFX9-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
@@ -1293,9 +1293,9 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(i8 addrspace(1)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %soffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4095
+  %load = load i8, ptr addrspace(1) %gep1
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
@@ -1306,7 +1306,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(i8 addrspace(1)
 ; --------------------------------------------------------------------------------
 
 ; Cannot push the shift into 32-bits, and cannot match.
-define amdgpu_ps float @global_load_saddr_f32_natural_addressing(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
+define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
 ; GFX9-LABEL: global_load_saddr_f32_natural_addressing:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
@@ -1345,15 +1345,15 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(float addrspace
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
+  %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
-  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
-  %load = load float, float addrspace(1)* %gep
+  %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load float, ptr addrspace(1) %gep
   ret float %load
 }
 
 ; Cannot push the shift into 32-bits, with an immediate offset.
-define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
+define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
 ; GCN-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off
@@ -1369,17 +1369,16 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(i8 ad
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
+  %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
-  %load = load float, float addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 128
+  %load = load float, ptr addrspace(1) %gep1
   ret float %load
 }
 
 ; Range is sufficiently restricted to push the shift into 32-bits.
-define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
+define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
 ; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off
@@ -1397,15 +1396,15 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(float addrspace(1)
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
+  %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0
   %zext.offset = zext i32 %voffset to i64
-  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
-  %load = load float, float addrspace(1)* %gep
+  %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load float, ptr addrspace(1) %gep
   ret float %load
 }
 
 ; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset
-define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
+define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
 ; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off
@@ -1423,16 +1422,16 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(float a
 ; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:400
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
+  %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds float, float addrspace(1)* %gep0, i64 100
-  %load = load float, float addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds float, ptr addrspace(1) %gep0, i64 100
+  %load = load float, ptr addrspace(1) %gep1
   ret float %load
 }
 
 ; Range is 1 beyond the limit where we can move the shift into 32-bits.
-define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
+define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
 ; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
@@ -1471,10 +1470,10 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(float ad
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
-  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !1
+  %voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !1
   %zext.offset = zext i32 %voffset to i64
-  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
-  %load = load float, float addrspace(1)* %gep
+  %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load float, ptr addrspace(1) %gep
   ret float %load
 }
 
@@ -1482,7 +1481,7 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(float ad
 ; Stress various type loads
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps half @global_load_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps half @global_load_saddr_i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
@@ -1495,14 +1494,13 @@ define amdgpu_ps half @global_load_saddr_i16(i8 addrspace(1)* inreg %sbase, i32
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i16, ptr addrspace(1) %gep0
   %cast.load = bitcast i16 %load to half
   ret half %cast.load
 }
 
-define amdgpu_ps half @global_load_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps half @global_load_saddr_i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i16_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
@@ -1515,15 +1513,14 @@ define amdgpu_ps half @global_load_saddr_i16_immneg128(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i16, ptr addrspace(1) %gep1
   %cast.load = bitcast i16 %load to half
   ret half %cast.load
 }
 
-define amdgpu_ps half @global_load_saddr_f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps half @global_load_saddr_f16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_f16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
@@ -1536,13 +1533,12 @@ define amdgpu_ps half @global_load_saddr_f16(i8 addrspace(1)* inreg %sbase, i32
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)*
-  %load = load half, half addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load half, ptr addrspace(1) %gep0
   ret half %load
 }
 
-define amdgpu_ps half @global_load_saddr_f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps half @global_load_saddr_f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_f16_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
@@ -1555,14 +1551,13 @@ define amdgpu_ps half @global_load_saddr_f16_immneg128(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to half addrspace(1)*
-  %load = load half, half addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load half, ptr addrspace(1) %gep1
   ret half %load
 }
 
-define amdgpu_ps float @global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
@@ -1575,14 +1570,13 @@ define amdgpu_ps float @global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %load = load i32, i32 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i32, ptr addrspace(1) %gep0
   %cast.load = bitcast i32 %load to float
   ret float %cast.load
 }
 
-define amdgpu_ps float @global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i32_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
@@ -1595,15 +1589,14 @@ define amdgpu_ps float @global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %load = load i32, i32 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i32, ptr addrspace(1) %gep1
   %cast.load = bitcast i32 %load to float
   ret float %cast.load
 }
 
-define amdgpu_ps float @global_load_saddr_f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_f32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
@@ -1616,13 +1609,12 @@ define amdgpu_ps float @global_load_saddr_f32(i8 addrspace(1)* inreg %sbase, i32
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
-  %load = load float, float addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load float, ptr addrspace(1) %gep0
   ret float %load
 }
 
-define amdgpu_ps float @global_load_saddr_f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_load_saddr_f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_f32_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
@@ -1635,14 +1627,13 @@ define amdgpu_ps float @global_load_saddr_f32_immneg128(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
-  %load = load float, float addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load float, ptr addrspace(1) %gep1
   ret float %load
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_v2i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_v2i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v2i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
@@ -1655,14 +1646,13 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2i16(i8 addrspace(1)* inreg %sba
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i16> addrspace(1)*
-  %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <2 x i16>, ptr addrspace(1) %gep0
   %cast.load = bitcast <2 x i16> %load to <2 x half>
   ret <2 x half> %cast.load
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v2i16_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
@@ -1675,15 +1665,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i16> addrspace(1)*
-  %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <2 x i16>, ptr addrspace(1) %gep1
   %cast.load = bitcast <2 x i16> %load to <2 x half>
   ret <2 x half> %cast.load
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_v2f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_v2f16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v2f16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
@@ -1696,13 +1685,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2f16(i8 addrspace(1)* inreg %sba
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)*
-  %load = load <2 x half>, <2 x half> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <2 x half>, ptr addrspace(1) %gep0
   ret <2 x half> %load
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v2f16_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
@@ -1715,14 +1703,13 @@ define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)*
-  %load = load <2 x half>, <2 x half> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <2 x half>, ptr addrspace(1) %gep1
   ret <2 x half> %load
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_p3(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_p3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
@@ -1735,15 +1722,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_p3(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)*
-  %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep0.cast
-  %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load ptr addrspace(3), ptr addrspace(1) %gep0
+  %cast.load0 = ptrtoint ptr addrspace(3) %load to i32
   %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
   ret <2 x half> %cast.load1
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_p3_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
@@ -1756,16 +1742,15 @@ define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(i8 addrspace(1)* inr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(3)* addrspace(1)*
-  %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep1.cast
-  %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load ptr addrspace(3), ptr addrspace(1) %gep1
+  %cast.load0 = ptrtoint ptr addrspace(3) %load to i32
   %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
   ret <2 x half> %cast.load1
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_f64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_f64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
@@ -1778,14 +1763,13 @@ define amdgpu_ps <2 x float> @global_load_saddr_f64(i8 addrspace(1)* inreg %sbas
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)*
-  %load = load double, double addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load double, ptr addrspace(1) %gep0
   %cast.load = bitcast double %load to <2 x float>
   ret <2 x float> %cast.load
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_f64_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
@@ -1798,15 +1782,14 @@ define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(i8 addrspace(1)* i
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to double addrspace(1)*
-  %load = load double, double addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load double, ptr addrspace(1) %gep1
   %cast.load = bitcast double %load to <2 x float>
   ret <2 x float> %cast.load
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_i64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
@@ -1819,14 +1802,13 @@ define amdgpu_ps <2 x float> @global_load_saddr_i64(i8 addrspace(1)* inreg %sbas
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %load = load i64, i64 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i64, ptr addrspace(1) %gep0
   %cast.load = bitcast i64 %load to <2 x float>
   ret <2 x float> %cast.load
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i64_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
@@ -1839,15 +1821,14 @@ define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(i8 addrspace(1)* i
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %load = load i64, i64 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i64, ptr addrspace(1) %gep1
   %cast.load = bitcast i64 %load to <2 x float>
   ret <2 x float> %cast.load
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_v2f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_v2f32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v2f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
@@ -1860,13 +1841,12 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2f32(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)*
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <2 x float>, ptr addrspace(1) %gep0
   ret <2 x float> %load
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v2f32_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
@@ -1879,14 +1859,13 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x float> addrspace(1)*
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <2 x float>, ptr addrspace(1) %gep1
   ret <2 x float> %load
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_v2i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_v2i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
@@ -1899,14 +1878,13 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2i32(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)*
-  %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <2 x i32>, ptr addrspace(1) %gep0
   %cast.load = bitcast <2 x i32> %load to <2 x float>
   ret <2 x float> %cast.load
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v2i32_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
@@ -1919,15 +1897,14 @@ define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i32> addrspace(1)*
-  %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <2 x i32>, ptr addrspace(1) %gep1
   %cast.load = bitcast <2 x i32> %load to <2 x float>
   ret <2 x float> %cast.load
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_v4i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_v4i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v4i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
@@ -1940,14 +1917,13 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4i16(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)*
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <4 x i16>, ptr addrspace(1) %gep0
   %cast.load = bitcast <4 x i16> %load to <2 x float>
   ret <2 x float> %cast.load
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v4i16_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
@@ -1960,15 +1936,14 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i16> addrspace(1)*
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <4 x i16>, ptr addrspace(1) %gep1
   %cast.load = bitcast <4 x i16> %load to <2 x float>
   ret <2 x float> %cast.load
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_v4f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_v4f16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v4f16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
@@ -1981,14 +1956,13 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4f16(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)*
-  %load = load <4 x half>, <4 x half> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <4 x half>, ptr addrspace(1) %gep0
   %cast.load = bitcast <4 x half> %load to <2 x float>
   ret <2 x float> %cast.load
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v4f16_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
@@ -2001,15 +1975,14 @@ define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x half> addrspace(1)*
-  %load = load <4 x half>, <4 x half> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <4 x half>, ptr addrspace(1) %gep1
   %cast.load = bitcast <4 x half> %load to <2 x float>
   ret <2 x float> %cast.load
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_p1(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_p1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
@@ -2022,15 +1995,14 @@ define amdgpu_ps <2 x float> @global_load_saddr_p1(i8 addrspace(1)* inreg %sbase
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)*
-  %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep0.cast
-  %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load ptr addrspace(1), ptr addrspace(1) %gep0
+  %cast.load0 = ptrtoint ptr addrspace(1) %load to i64
   %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
   ret <2 x float> %cast.load1
 }
 
-define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_p1_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
@@ -2043,16 +2015,15 @@ define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(i8 addrspace(1)* in
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* addrspace(1)*
-  %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep1.cast
-  %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load ptr addrspace(1), ptr addrspace(1) %gep1
+  %cast.load0 = ptrtoint ptr addrspace(1) %load to i64
   %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
   ret <2 x float> %cast.load1
 }
 
-define amdgpu_ps <3 x float> @global_load_saddr_v3f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <3 x float> @global_load_saddr_v3f32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v3f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
@@ -2065,13 +2036,12 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3f32(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)*
-  %load = load <3 x float>, <3 x float> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <3 x float>, ptr addrspace(1) %gep0
   ret <3 x float> %load
 }
 
-define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v3f32_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
@@ -2084,14 +2054,13 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x float> addrspace(1)*
-  %load = load <3 x float>, <3 x float> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <3 x float>, ptr addrspace(1) %gep1
   ret <3 x float> %load
 }
 
-define amdgpu_ps <3 x float> @global_load_saddr_v3i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <3 x float> @global_load_saddr_v3i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v3i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
@@ -2104,14 +2073,13 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3i32(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)*
-  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <3 x i32>, ptr addrspace(1) %gep0
   %cast.load = bitcast <3 x i32> %load to <3 x float>
   ret <3 x float> %cast.load
 }
 
-define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v3i32_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
@@ -2124,15 +2092,14 @@ define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x i32> addrspace(1)*
-  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <3 x i32>, ptr addrspace(1) %gep1
   %cast.load = bitcast <3 x i32> %load to <3 x float>
   ret <3 x float> %cast.load
 }
 
-define amdgpu_ps <6 x half> @global_load_saddr_v6f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <6 x half> @global_load_saddr_v6f16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v6f16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
@@ -2145,13 +2112,12 @@ define amdgpu_ps <6 x half> @global_load_saddr_v6f16(i8 addrspace(1)* inreg %sba
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)*
-  %load = load <6 x half>, <6 x half> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <6 x half>, ptr addrspace(1) %gep0
   ret <6 x half> %load
 }
 
-define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v6f16_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
@@ -2164,14 +2130,13 @@ define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x half> addrspace(1)*
-  %load = load <6 x half>, <6 x half> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <6 x half>, ptr addrspace(1) %gep1
   ret <6 x half> %load
 }
 
-define amdgpu_ps <4 x float> @global_load_saddr_v4f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <4 x float> @global_load_saddr_v4f32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v4f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
@@ -2184,13 +2149,12 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4f32(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)*
-  %load = load <4 x float>, <4 x float> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <4 x float>, ptr addrspace(1) %gep0
   ret <4 x float> %load
 }
 
-define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v4f32_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
@@ -2203,14 +2167,13 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x float> addrspace(1)*
-  %load = load <4 x float>, <4 x float> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <4 x float>, ptr addrspace(1) %gep1
   ret <4 x float> %load
 }
 
-define amdgpu_ps <4 x float> @global_load_saddr_v4i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <4 x float> @global_load_saddr_v4i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
@@ -2223,14 +2186,13 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4i32(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)*
-  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <4 x i32>, ptr addrspace(1) %gep0
   %cast.load = bitcast <4 x i32> %load to <4 x float>
   ret <4 x float> %cast.load
 }
 
-define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v4i32_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
@@ -2243,15 +2205,14 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i32> addrspace(1)*
-  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <4 x i32>, ptr addrspace(1) %gep1
   %cast.load = bitcast <4 x i32> %load to <4 x float>
   ret <4 x float> %cast.load
 }
 
-define amdgpu_ps <4 x float> @global_load_saddr_v2i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <4 x float> @global_load_saddr_v2i64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v2i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
@@ -2264,14 +2225,13 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2i64(i8 addrspace(1)* inreg %sb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)*
-  %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <2 x i64>, ptr addrspace(1) %gep0
   %cast.load = bitcast <2 x i64> %load to <4 x float>
   ret <4 x float> %cast.load
 }
 
-define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v2i64_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
@@ -2284,15 +2244,14 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i64> addrspace(1)*
-  %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <2 x i64>, ptr addrspace(1) %gep1
   %cast.load = bitcast <2 x i64> %load to <4 x float>
   ret <4 x float> %cast.load
 }
 
-define amdgpu_ps <4 x float> @global_load_saddr_i128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <4 x float> @global_load_saddr_i128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
@@ -2305,14 +2264,13 @@ define amdgpu_ps <4 x float> @global_load_saddr_i128(i8 addrspace(1)* inreg %sba
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i128 addrspace(1)*
-  %load = load i128, i128 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i128, ptr addrspace(1) %gep0
   %cast.load = bitcast i128 %load to <4 x float>
   ret <4 x float> %cast.load
 }
 
-define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i128_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
@@ -2325,15 +2283,14 @@ define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i128 addrspace(1)*
-  %load = load i128, i128 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i128, ptr addrspace(1) %gep1
   %cast.load = bitcast i128 %load to <4 x float>
   ret <4 x float> %cast.load
 }
 
-define amdgpu_ps <4 x float> @global_load_saddr_v2p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <4 x float> @global_load_saddr_v2p1(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v2p1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
@@ -2346,15 +2303,14 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2p1(i8 addrspace(1)* inreg %sba
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)*
-  %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep0.cast
-  %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <2 x ptr addrspace(1)>, ptr addrspace(1) %gep0
+  %cast.load0 = ptrtoint <2 x ptr addrspace(1)> %load to <2 x i64>
   %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
   ret <4 x float> %cast.load1
 }
 
-define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v2p1_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
@@ -2367,16 +2323,15 @@ define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i8 addrspace(1)*> addrspace(1)*
-  %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep1.cast
-  %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <2 x ptr addrspace(1)>, ptr addrspace(1) %gep1
+  %cast.load0 = ptrtoint <2 x ptr addrspace(1)> %load to <2 x i64>
   %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
   ret <4 x float> %cast.load1
 }
 
-define amdgpu_ps <4 x float> @global_load_saddr_v4p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <4 x float> @global_load_saddr_v4p3(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v4p3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
@@ -2389,15 +2344,14 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4p3(i8 addrspace(1)* inreg %sba
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)*
-  %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep0.cast
-  %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32>
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load <4 x ptr addrspace(3)>, ptr addrspace(1) %gep0
+  %cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32>
   %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
   ret <4 x float> %cast.load1
 }
 
-define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_v4p3_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
@@ -2410,11 +2364,10 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i8 addrspace(3)*> addrspace(1)*
-  %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep1.cast
-  %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32>
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load <4 x ptr addrspace(3)>, ptr addrspace(1) %gep1
+  %cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32>
   %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
   ret <4 x float> %cast.load1
 }
@@ -2423,7 +2376,7 @@ define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(i8 addrspace(1)*
 ; Extending loads
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_sextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_sextload_saddr_i8(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_sextload_saddr_i8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_sbyte v0, v0, s[2:3]
@@ -2436,14 +2389,14 @@ define amdgpu_ps float @global_sextload_saddr_i8(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i8, ptr addrspace(1) %gep0
   %sextload = sext i8 %load to i32
   %cast.load = bitcast i32 %sextload to float
   ret float %cast.load
 }
 
-define amdgpu_ps float @global_sextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_sextload_saddr_i8_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_sextload_saddr_i8_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_sbyte v0, v0, s[2:3] offset:-128
@@ -2456,15 +2409,15 @@ define amdgpu_ps float @global_sextload_saddr_i8_immneg128(i8 addrspace(1)* inre
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i8, ptr addrspace(1) %gep1
   %sextload = sext i8 %load to i32
   %cast.load = bitcast i32 %sextload to float
   ret float %cast.load
 }
 
-define amdgpu_ps float @global_sextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_sextload_saddr_i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_sextload_saddr_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_sshort v0, v0, s[2:3]
@@ -2477,15 +2430,14 @@ define amdgpu_ps float @global_sextload_saddr_i16(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i16, ptr addrspace(1) %gep0
   %sextload = sext i16 %load to i32
   %cast.load = bitcast i32 %sextload to float
   ret float %cast.load
 }
 
-define amdgpu_ps float @global_sextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_sextload_saddr_i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_sextload_saddr_i16_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_sshort v0, v0, s[2:3] offset:-128
@@ -2498,16 +2450,15 @@ define amdgpu_ps float @global_sextload_saddr_i16_immneg128(i8 addrspace(1)* inr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i16, ptr addrspace(1) %gep1
   %sextload = sext i16 %load to i32
   %cast.load = bitcast i32 %sextload to float
   ret float %cast.load
 }
 
-define amdgpu_ps float @global_zextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_zextload_saddr_i8(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_zextload_saddr_i8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
@@ -2520,14 +2471,14 @@ define amdgpu_ps float @global_zextload_saddr_i8(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %load = load i8, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i8, ptr addrspace(1) %gep0
   %zextload = zext i8 %load to i32
   %cast.load = bitcast i32 %zextload to float
   ret float %cast.load
 }
 
-define amdgpu_ps float @global_zextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_zextload_saddr_i8_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_zextload_saddr_i8_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-128
@@ -2540,15 +2491,15 @@ define amdgpu_ps float @global_zextload_saddr_i8_immneg128(i8 addrspace(1)* inre
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %load = load i8, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i8, ptr addrspace(1) %gep1
   %zextload = zext i8 %load to i32
   %cast.load = bitcast i32 %zextload to float
   ret float %cast.load
 }
 
-define amdgpu_ps float @global_zextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_zextload_saddr_i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_zextload_saddr_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
@@ -2561,15 +2512,14 @@ define amdgpu_ps float @global_zextload_saddr_i16(i8 addrspace(1)* inreg %sbase,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i16, ptr addrspace(1) %gep0
   %zextload = zext i16 %load to i32
   %cast.load = bitcast i32 %zextload to float
   ret float %cast.load
 }
 
-define amdgpu_ps float @global_zextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @global_zextload_saddr_i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_zextload_saddr_i16_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
@@ -2582,10 +2532,9 @@ define amdgpu_ps float @global_zextload_saddr_i16_immneg128(i8 addrspace(1)* inr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i16, ptr addrspace(1) %gep1
   %zextload = zext i16 %load to i32
   %cast.load = bitcast i32 %zextload to float
   ret float %cast.load
@@ -2595,7 +2544,7 @@ define amdgpu_ps float @global_zextload_saddr_i16_immneg128(i8 addrspace(1)* inr
 ; Atomic load
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @atomic_global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GFX9-LABEL: atomic_global_load_saddr_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2624,14 +2573,13 @@ define amdgpu_ps float @atomic_global_load_saddr_i32(i8 addrspace(1)* inreg %sba
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %load = load atomic i32, i32 addrspace(1)* %gep0.cast seq_cst, align 4
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load atomic i32, ptr addrspace(1) %gep0 seq_cst, align 4
   %cast.load = bitcast i32 %load to float
   ret float %cast.load
 }
 
-define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2660,15 +2608,14 @@ define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(i8 addrspace(1)*
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %load = load atomic i32, i32 addrspace(1)* %gep1.cast seq_cst, align 4
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load atomic i32, ptr addrspace(1) %gep1 seq_cst, align 4
   %cast.load = bitcast i32 %load to float
   ret float %cast.load
 }
 
-define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GFX9-LABEL: atomic_global_load_saddr_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2697,14 +2644,13 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(i8 addrspace(1)* inre
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  %load = load atomic i64, i64 addrspace(1)* %gep0.cast seq_cst, align 8
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load atomic i64, ptr addrspace(1) %gep0 seq_cst, align 8
   %cast.load = bitcast i64 %load to <2 x float>
   ret <2 x float> %cast.load
 }
 
-define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2733,10 +2679,9 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(i8 addrspac
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  %load = load atomic i64, i64 addrspace(1)* %gep1.cast seq_cst, align 8
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load atomic i64, ptr addrspace(1) %gep1 seq_cst, align 8
   %cast.load = bitcast i64 %load to <2 x float>
   ret <2 x float> %cast.load
 }
@@ -2745,7 +2690,7 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(i8 addrspac
 ; D16 load (low 16)
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_short_d16 v0, v0, s[2:3]
@@ -2758,15 +2703,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(i8 addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i16, ptr addrspace(1) %gep0
   %build = insertelement <2 x i16> undef, i16 %load, i32 0
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_short_d16 v0, v0, s[2:3] offset:-128
@@ -2779,16 +2723,15 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(i8 a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i16, ptr addrspace(1) %gep1
   %build = insertelement <2 x i16> undef, i16 %load, i32 0
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -2805,15 +2748,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(i8 addrspace(1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i16, ptr addrspace(1) %gep0
   %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -2830,16 +2772,15 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(i8 ad
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i16, ptr addrspace(1) %gep1
   %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3]
@@ -2854,15 +2795,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(i8 addrspace(1)*
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i16, ptr addrspace(1) %gep0
   %build = insertelement <2 x i16> %reg, i16 %load, i32 0
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3] offset:-128
@@ -2877,16 +2817,15 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(i8 add
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i16, ptr addrspace(1) %gep1
   %build = insertelement <2 x i16> %reg, i16 %load, i32 0
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte_d16 v1, v0, s[2:3]
@@ -2901,16 +2840,15 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(i8 addrsp
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i8, ptr addrspace(1) %gep0
   %zext.load = zext i8 %load to i16
   %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte_d16 v1, v0, s[2:3] offset:-128
@@ -2925,17 +2863,16 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i8, ptr addrspace(1) %gep1
   %zext.load = zext i8 %load to i16
   %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_sbyte_d16 v1, v0, s[2:3]
@@ -2950,16 +2887,15 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(i8 addrsp
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i8, ptr addrspace(1) %gep0
   %sext.load = sext i8 %load to i16
   %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_sbyte_d16 v1, v0, s[2:3] offset:-128
@@ -2974,10 +2910,9 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i8, ptr addrspace(1) %gep1
   %sext.load = sext i8 %load to i16
   %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
   %cast = bitcast <2 x i16> %build to <2 x half>
@@ -2988,7 +2923,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128
 ; D16 hi load (hi16)
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_short_d16_hi v0, v0, s[2:3]
@@ -3001,15 +2936,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(i8 addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i16, ptr addrspace(1) %gep0
   %build = insertelement <2 x i16> undef, i16 %load, i32 1
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_short_d16_hi v0, v0, s[2:3] offset:-128
@@ -3022,16 +2956,15 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(i8 a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i16, ptr addrspace(1) %gep1
   %build = insertelement <2 x i16> undef, i16 %load, i32 1
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -3048,15 +2981,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(i8 addrspace(1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i16, ptr addrspace(1) %gep0
   %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
 ; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -3073,16 +3005,15 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(i8 ad
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i16, ptr addrspace(1) %gep1
   %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3]
@@ -3097,15 +3028,14 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(i8 addrspace(1)*
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i16, ptr addrspace(1) %gep0
   %build = insertelement <2 x i16> %reg, i16 %load, i32 1
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3] offset:-128
@@ -3120,16 +3050,15 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(i8 add
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
-  %load = load i16, i16 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i16, ptr addrspace(1) %gep1
   %build = insertelement <2 x i16> %reg, i16 %load, i32 1
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte_d16_hi v1, v0, s[2:3]
@@ -3144,16 +3073,15 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(i8 addrsp
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i8, ptr addrspace(1) %gep0
   %zext.load = zext i8 %load to i16
   %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_ubyte_d16_hi v1, v0, s[2:3] offset:-128
@@ -3168,17 +3096,16 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i8, ptr addrspace(1) %gep1
   %zext.load = zext i8 %load to i16
   %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_sbyte_d16_hi v1, v0, s[2:3]
@@ -3193,16 +3120,15 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(i8 addrsp
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %load = load i8, ptr addrspace(1) %gep0
   %sext.load = sext i8 %load to i16
   %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
   %cast = bitcast <2 x i16> %build to <2 x half>
   ret <2 x half> %cast
 }
 
-define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_sbyte_d16_hi v1, v0, s[2:3] offset:-128
@@ -3217,10 +3143,9 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %load = load i8, ptr addrspace(1) %gep1
   %sext.load = sext i8 %load to i16
   %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
   %cast = bitcast <2 x i16> %build to <2 x half>
@@ -3232,7 +3157,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128
 ; --------------------------------------------------------------------------------
 
 ; Check add-as-or with split 64-bit or.
-define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(i8 addrspace(6)* inreg %sbase, i32 %idx) {
+define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrspace(6) inreg %sbase, i32 %idx) {
 ; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_or_b32_e32 v0, 16, v0
@@ -3250,14 +3175,14 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(i8 addr
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.idx = zext i32 %idx to i64
   %or = or i64 %zext.idx, 16
-  %addr = inttoptr i64 %or to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %addr
+  %addr = inttoptr i64 %or to ptr addrspace(1)
+  %load = load i8, ptr addrspace(1) %addr
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
 }
 
-define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(i8 addrspace(6)* inreg %sbase, i32 %idx) {
+define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) {
 ; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_or_b32_e32 v0, 0x1040, v0
@@ -3275,8 +3200,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(i8 ad
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.idx = zext i32 %idx to i64
   %or = or i64 %zext.idx, 4160
-  %addr = inttoptr i64 %or to i8 addrspace(1)*
-  %load = load i8, i8 addrspace(1)* %addr
+  %addr = inttoptr i64 %or to ptr addrspace(1)
+  %load = load i8, ptr addrspace(1) %addr
   %zext = zext i8 %load to i32
   %to.vgpr = bitcast i32 %zext to float
   ret float %to.vgpr
@@ -3286,7 +3211,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(i8 ad
 ; Full 64-bit scalar add.
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps void @global_addr_64bit_lsr_iv(float addrspace(1)* inreg %arg) {
+define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
 ; GFX9-LABEL: global_addr_64bit_lsr_iv:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_mov_b64 s[0:1], 0
@@ -3348,8 +3273,8 @@ bb2:                                              ; preds = %bb3
 bb3:                                              ; preds = %bb3, %bb
   %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
   %i4 = zext i32 %i to i64
-  %i5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %i4
-  %i6 = load volatile float, float addrspace(1)* %i5, align 4
+  %i5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %i4
+  %i6 = load volatile float, ptr addrspace(1) %i5, align 4
   %i8 = add nuw nsw i32 %i, 1
   %i9 = icmp eq i32 %i8, 256
   br i1 %i9, label %bb2, label %bb3
@@ -3357,7 +3282,7 @@ bb3:                                              ; preds = %bb3, %bb
 
 ; Make sure we only have a single zero vaddr initialization.
 
-define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(float addrspace(1)* inreg %arg, float addrspace(1)* inreg %arg.1) {
+define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg %arg, ptr addrspace(1) inreg %arg.1) {
 ; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_mov_b64 s[0:1], 0
@@ -3427,10 +3352,10 @@ bb2:                                              ; preds = %bb3
 bb3:                                              ; preds = %bb3, %bb
   %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
   %i4 = zext i32 %i to i64
-  %i5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %i4
-  %i6 = load volatile float, float addrspace(1)* %i5, align 4
-  %i5.1 = getelementptr inbounds float, float addrspace(1)* %arg.1, i64 %i4
-  %i6.1 = load volatile float, float addrspace(1)* %i5, align 4
+  %i5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %i4
+  %i6 = load volatile float, ptr addrspace(1) %i5, align 4
+  %i5.1 = getelementptr inbounds float, ptr addrspace(1) %arg.1, i64 %i4
+  %i6.1 = load volatile float, ptr addrspace(1) %i5, align 4
   %i8 = add nuw nsw i32 %i, 1
   %i9 = icmp eq i32 %i8, 256
   br i1 %i9, label %bb2, label %bb3

diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
index ea2dbf46a1d75..87d680aad4cf1 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
@@ -5,7 +5,7 @@
 
 ; Test using saddr addressing mode of global_*store_* flat instructions.
 
-define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr, i8 %data) {
+define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr, i8 %data) {
 ; GCN-LABEL: global_store_saddr_i8_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off
@@ -20,15 +20,15 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
+  %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  store i8 %data, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store i8 %data, ptr addrspace(1) %gep0
   ret void
 }
 
 ; Maximum positive offset on gfx10
-define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr, i8 %data) {
+define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr, i8 %data) {
 ; GCN-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off
@@ -43,16 +43,16 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3] offset:2047
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
+  %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047
-  store i8 %data, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2047
+  store i8 %data, ptr addrspace(1) %gep1
   ret void
 }
 
 ; Maximum negative offset on gfx10
-define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr, i8 %data) {
+define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr, i8 %data) {
 ; GCN-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off
@@ -67,11 +67,11 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspa
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3] offset:-2048
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
+  %voffset = load i32, ptr addrspace(1) %voffset.ptr
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
-  store i8 %data, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048
+  store i8 %data, ptr addrspace(1) %gep1
   ret void
 }
 
@@ -79,7 +79,7 @@ define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspa
 ; Uniformity edge cases
 ; --------------------------------------------------------------------------------
 
-@ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef
+@ptr.in.lds = internal addrspace(3) global ptr addrspace(1) undef
 
 ; Base pointer is uniform, but also in VGPRs
 define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) {
@@ -114,10 +114,10 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+  %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  store i8 %data, i8 addrspace(1)* %gep0
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store i8 %data, ptr addrspace(1) %gep0
   ret void
 }
 
@@ -154,11 +154,11 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1] offset:-120
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+  %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -120
-  store i8 %data, i8 addrspace(1)* %gep1
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -120
+  store i8 %data, ptr addrspace(1) %gep1
   ret void
 }
 
@@ -166,7 +166,7 @@ define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
 ; Stress various type stores
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps void @global_store_saddr_i16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i16 %data) {
+define amdgpu_ps void @global_store_saddr_i16_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, i16 %data) {
 ; GCN-LABEL: global_store_saddr_i16_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_short v0, v1, s[2:3]
@@ -178,13 +178,12 @@ define amdgpu_ps void @global_store_saddr_i16_zext_vgpr(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
-  store i16 %data, i16 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store i16 %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_i16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i16 %data) {
+define amdgpu_ps void @global_store_saddr_i16_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i16 %data) {
 ; GCN-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_short v0, v1, s[2:3] offset:-128
@@ -196,14 +195,13 @@ define amdgpu_ps void @global_store_saddr_i16_zext_vgpr_offset_neg128(i8 addrspa
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
-  store i16 %data, i16 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store i16 %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_f16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, half %data) {
+define amdgpu_ps void @global_store_saddr_f16_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, half %data) {
 ; GCN-LABEL: global_store_saddr_f16_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_short v0, v1, s[2:3]
@@ -215,13 +213,12 @@ define amdgpu_ps void @global_store_saddr_f16_zext_vgpr(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)*
-  store half %data, half addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store half %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_f16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, half %data) {
+define amdgpu_ps void @global_store_saddr_f16_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, half %data) {
 ; GCN-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_short v0, v1, s[2:3] offset:-128
@@ -233,14 +230,13 @@ define amdgpu_ps void @global_store_saddr_f16_zext_vgpr_offset_neg128(i8 addrspa
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to half addrspace(1)*
-  store half %data, half addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store half %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_store_saddr_i32_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_store_saddr_i32_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dword v0, v1, s[2:3]
@@ -252,13 +248,12 @@ define amdgpu_ps void @global_store_saddr_i32_zext_vgpr(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  store i32 %data, i32 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store i32 %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_store_saddr_i32_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dword v0, v1, s[2:3] offset:-128
@@ -270,14 +265,13 @@ define amdgpu_ps void @global_store_saddr_i32_zext_vgpr_offset_neg128(i8 addrspa
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  store i32 %data, i32 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store i32 %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_f32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, float %data) {
+define amdgpu_ps void @global_store_saddr_f32_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, float %data) {
 ; GCN-LABEL: global_store_saddr_f32_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dword v0, v1, s[2:3]
@@ -289,13 +283,12 @@ define amdgpu_ps void @global_store_saddr_f32_zext_vgpr(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
-  store float %data, float addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store float %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_f32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, float %data) {
+define amdgpu_ps void @global_store_saddr_f32_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, float %data) {
 ; GCN-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dword v0, v1, s[2:3] offset:-128
@@ -307,14 +300,13 @@ define amdgpu_ps void @global_store_saddr_f32_zext_vgpr_offset_neg128(i8 addrspa
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
-  store float %data, float addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store float %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_p3_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i8 addrspace(3)* %data) {
+define amdgpu_ps void @global_store_saddr_p3_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, ptr addrspace(3) %data) {
 ; GCN-LABEL: global_store_saddr_p3_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dword v0, v1, s[2:3]
@@ -326,13 +318,12 @@ define amdgpu_ps void @global_store_saddr_p3_zext_vgpr(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)*
-  store i8 addrspace(3)* %data, i8 addrspace(3)* addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store ptr addrspace(3) %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_p3_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i8 addrspace(3)* %data) {
+define amdgpu_ps void @global_store_saddr_p3_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, ptr addrspace(3) %data) {
 ; GCN-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dword v0, v1, s[2:3] offset:-128
@@ -344,14 +335,13 @@ define amdgpu_ps void @global_store_saddr_p3_zext_vgpr_offset_neg128(i8 addrspac
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(3)* addrspace(1)*
-  store i8 addrspace(3)* %data, i8 addrspace(3)* addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store ptr addrspace(3) %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_i64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_store_saddr_i64_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GCN-LABEL: global_store_saddr_i64_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3]
@@ -363,13 +353,12 @@ define amdgpu_ps void @global_store_saddr_i64_zext_vgpr(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  store i64 %data, i64 addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store i64 %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_i64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @global_store_saddr_i64_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GCN-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128
@@ -381,14 +370,13 @@ define amdgpu_ps void @global_store_saddr_i64_zext_vgpr_offset_neg128(i8 addrspa
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  store i64 %data, i64 addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store i64 %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_f64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, double %data) {
+define amdgpu_ps void @global_store_saddr_f64_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, double %data) {
 ; GCN-LABEL: global_store_saddr_f64_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3]
@@ -400,13 +388,12 @@ define amdgpu_ps void @global_store_saddr_f64_zext_vgpr(i8 addrspace(1)* inreg %
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)*
-  store double %data, double addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store double %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_f64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, double %data) {
+define amdgpu_ps void @global_store_saddr_f64_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, double %data) {
 ; GCN-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128
@@ -418,14 +405,13 @@ define amdgpu_ps void @global_store_saddr_f64_zext_vgpr_offset_neg128(i8 addrspa
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to double addrspace(1)*
-  store double %data, double addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store double %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i32> %data) {
+define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i32> %data) {
 ; GCN-LABEL: global_store_saddr_v2i32_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3]
@@ -437,13 +423,12 @@ define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)*
-  store <2 x i32> %data, <2 x i32> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <2 x i32> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i32> %data) {
+define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i32> %data) {
 ; GCN-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128
@@ -455,14 +440,13 @@ define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i32> addrspace(1)*
-  store <2 x i32> %data, <2 x i32> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <2 x i32> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x float> %data) {
+define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x float> %data) {
 ; GCN-LABEL: global_store_saddr_v2f32_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3]
@@ -474,13 +458,12 @@ define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)*
-  store <2 x float> %data, <2 x float> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <2 x float> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x float> %data) {
+define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x float> %data) {
 ; GCN-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128
@@ -492,14 +475,13 @@ define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x float> addrspace(1)*
-  store <2 x float> %data, <2 x float> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <2 x float> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i16> %data) {
+define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <4 x i16> %data) {
 ; GCN-LABEL: global_store_saddr_v4i16_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3]
@@ -511,13 +493,12 @@ define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)*
-  store <4 x i16> %data, <4 x i16> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <4 x i16> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i16> %data) {
+define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <4 x i16> %data) {
 ; GCN-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128
@@ -529,14 +510,13 @@ define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i16> addrspace(1)*
-  store <4 x i16> %data, <4 x i16> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <4 x i16> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x half> %data) {
+define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <4 x half> %data) {
 ; GCN-LABEL: global_store_saddr_v4f16_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3]
@@ -548,13 +528,12 @@ define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)*
-  store <4 x half> %data, <4 x half> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <4 x half> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x half> %data) {
+define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <4 x half> %data) {
 ; GCN-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128
@@ -566,14 +545,13 @@ define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x half> addrspace(1)*
-  store <4 x half> %data, <4 x half> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <4 x half> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_p1_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i8 addrspace(1)* %data) {
+define amdgpu_ps void @global_store_saddr_p1_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, ptr addrspace(1) %data) {
 ; GCN-LABEL: global_store_saddr_p1_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3]
@@ -585,13 +563,12 @@ define amdgpu_ps void @global_store_saddr_p1_zext_vgpr(i8 addrspace(1)* inreg %s
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)*
-  store i8 addrspace(1)* %data, i8 addrspace(1)* addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store ptr addrspace(1) %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_p1_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i8 addrspace(1)* %data) {
+define amdgpu_ps void @global_store_saddr_p1_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, ptr addrspace(1) %data) {
 ; GCN-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128
@@ -603,14 +580,13 @@ define amdgpu_ps void @global_store_saddr_p1_zext_vgpr_offset_neg128(i8 addrspac
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* addrspace(1)*
-  store i8 addrspace(1)* %data, i8 addrspace(1)* addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store ptr addrspace(1) %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <3 x i32> %data) {
+define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <3 x i32> %data) {
 ; GCN-LABEL: global_store_saddr_v3i32_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx3 v0, v[1:3], s[2:3]
@@ -622,13 +598,12 @@ define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)*
-  store <3 x i32> %data, <3 x i32> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <3 x i32> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <3 x i32> %data) {
+define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <3 x i32> %data) {
 ; GCN-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx3 v0, v[1:3], s[2:3] offset:-128
@@ -640,14 +615,13 @@ define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x i32> addrspace(1)*
-  store <3 x i32> %data, <3 x i32> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <3 x i32> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <3 x float> %data) {
+define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <3 x float> %data) {
 ; GCN-LABEL: global_store_saddr_v3f32_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx3 v0, v[1:3], s[2:3]
@@ -659,13 +633,12 @@ define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)*
-  store <3 x float> %data, <3 x float> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <3 x float> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <3 x float> %data) {
+define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <3 x float> %data) {
 ; GCN-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx3 v0, v[1:3], s[2:3] offset:-128
@@ -677,14 +650,13 @@ define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x float> addrspace(1)*
-  store <3 x float> %data, <3 x float> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <3 x float> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <6 x i16> %data) {
+define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <6 x i16> %data) {
 ; GCN-LABEL: global_store_saddr_v6i16_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx3 v0, v[1:3], s[2:3]
@@ -696,13 +668,12 @@ define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x i16> addrspace(1)*
-  store <6 x i16> %data, <6 x i16> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <6 x i16> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <6 x i16> %data) {
+define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <6 x i16> %data) {
 ; GCN-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx3 v0, v[1:3], s[2:3] offset:-128
@@ -714,14 +685,13 @@ define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x i16> addrspace(1)*
-  store <6 x i16> %data, <6 x i16> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <6 x i16> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <6 x half> %data) {
+define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <6 x half> %data) {
 ; GCN-LABEL: global_store_saddr_v6f16_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx3 v0, v[1:3], s[2:3]
@@ -733,13 +703,12 @@ define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)*
-  store <6 x half> %data, <6 x half> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <6 x half> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <6 x half> %data) {
+define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <6 x half> %data) {
 ; GCN-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx3 v0, v[1:3], s[2:3] offset:-128
@@ -751,14 +720,13 @@ define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x half> addrspace(1)*
-  store <6 x half> %data, <6 x half> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <6 x half> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i32> %data) {
+define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <4 x i32> %data) {
 ; GCN-LABEL: global_store_saddr_v4i32_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
@@ -770,13 +738,12 @@ define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)*
-  store <4 x i32> %data, <4 x i32> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <4 x i32> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i32> %data) {
+define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <4 x i32> %data) {
 ; GCN-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128
@@ -788,14 +755,13 @@ define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i32> addrspace(1)*
-  store <4 x i32> %data, <4 x i32> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <4 x i32> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x float> %data) {
+define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <4 x float> %data) {
 ; GCN-LABEL: global_store_saddr_v4f32_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
@@ -807,13 +773,12 @@ define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)*
-  store <4 x float> %data, <4 x float> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <4 x float> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x float> %data) {
+define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <4 x float> %data) {
 ; GCN-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128
@@ -825,14 +790,13 @@ define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x float> addrspace(1)*
-  store <4 x float> %data, <4 x float> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <4 x float> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i64> %data) {
+define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i64> %data) {
 ; GCN-LABEL: global_store_saddr_v2i64_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
@@ -844,13 +808,12 @@ define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)*
-  store <2 x i64> %data, <2 x i64> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <2 x i64> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i64> %data) {
+define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i64> %data) {
 ; GCN-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128
@@ -862,14 +825,13 @@ define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i64> addrspace(1)*
-  store <2 x i64> %data, <2 x i64> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <2 x i64> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x double> %data) {
+define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x double> %data) {
 ; GCN-LABEL: global_store_saddr_v2f64_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
@@ -881,13 +843,12 @@ define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x double> addrspace(1)*
-  store <2 x double> %data, <2 x double> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <2 x double> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x double> %data) {
+define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x double> %data) {
 ; GCN-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128
@@ -899,14 +860,13 @@ define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x double> addrspace(1)*
-  store <2 x double> %data, <2 x double> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <2 x double> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <8 x i16> %data) {
+define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <8 x i16> %data) {
 ; GCN-LABEL: global_store_saddr_v8i16_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
@@ -918,13 +878,12 @@ define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <8 x i16> addrspace(1)*
-  store <8 x i16> %data, <8 x i16> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <8 x i16> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <8 x i16> %data) {
+define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <8 x i16> %data) {
 ; GCN-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128
@@ -936,14 +895,13 @@ define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <8 x i16> addrspace(1)*
-  store <8 x i16> %data, <8 x i16> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <8 x i16> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <8 x half> %data) {
+define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <8 x half> %data) {
 ; GCN-LABEL: global_store_saddr_v8f16_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
@@ -955,13 +913,12 @@ define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <8 x half> addrspace(1)*
-  store <8 x half> %data, <8 x half> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <8 x half> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <8 x half> %data) {
+define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <8 x half> %data) {
 ; GCN-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128
@@ -973,14 +930,13 @@ define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr_offset_neg128(i8 addrs
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <8 x half> addrspace(1)*
-  store <8 x half> %data, <8 x half> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <8 x half> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i8 addrspace(1)*> %data) {
+define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x ptr addrspace(1)> %data) {
 ; GCN-LABEL: global_store_saddr_v2p1_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
@@ -992,13 +948,12 @@ define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)*
-  store <2 x i8 addrspace(1)*> %data, <2 x i8 addrspace(1)*> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <2 x ptr addrspace(1)> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i8 addrspace(1)*> %data) {
+define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x ptr addrspace(1)> %data) {
 ; GCN-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128
@@ -1010,14 +965,13 @@ define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr_offset_neg128(i8 addrsp
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i8 addrspace(1)*> addrspace(1)*
-  store <2 x i8 addrspace(1)*> %data, <2 x i8 addrspace(1)*> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <2 x ptr addrspace(1)> %data, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i8 addrspace(3)*> %data) {
+define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <4 x ptr addrspace(3)> %data) {
 ; GCN-LABEL: global_store_saddr_v4p3_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
@@ -1029,13 +983,12 @@ define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr(i8 addrspace(1)* inreg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)*
-  store <4 x i8 addrspace(3)*> %data, <4 x i8 addrspace(3)*> addrspace(1)* %gep0.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store <4 x ptr addrspace(3)> %data, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i8 addrspace(3)*> %data) {
+define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <4 x ptr addrspace(3)> %data) {
 ; GCN-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128
@@ -1047,10 +1000,9 @@ define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr_offset_neg128(i8 addrsp
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i8 addrspace(3)*> addrspace(1)*
-  store <4 x i8 addrspace(3)*> %data, <4 x i8 addrspace(3)*> addrspace(1)* %gep1.cast
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store <4 x ptr addrspace(3)> %data, ptr addrspace(1) %gep1
   ret void
 }
 
@@ -1058,7 +1010,7 @@ define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr_offset_neg128(i8 addrsp
 ; Atomic store
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: atomic_global_store_saddr_i32_zext_vgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1080,13 +1032,12 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr(i8 addrspace(1)*
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  store atomic i32 %data, i32 addrspace(1)* %gep0.cast seq_cst, align 4
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store atomic i32 %data, ptr addrspace(1) %gep0 seq_cst, align 4
   ret void
 }
 
-define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX9-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1108,14 +1059,13 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr_offset_neg128(i8
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  store atomic i32 %data, i32 addrspace(1)* %gep1.cast seq_cst, align 4
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store atomic i32 %data, ptr addrspace(1) %gep1 seq_cst, align 4
   ret void
 }
 
-define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: atomic_global_store_saddr_i64_zext_vgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1137,13 +1087,12 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr(i8 addrspace(1)*
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
-  store atomic i64 %data, i64 addrspace(1)* %gep0.cast seq_cst, align 8
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  store atomic i64 %data, ptr addrspace(1) %gep0 seq_cst, align 8
   ret void
 }
 
-define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX9-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1165,10 +1114,9 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr_offset_neg128(i8
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
-  store atomic i64 %data, i64 addrspace(1)* %gep1.cast seq_cst, align 8
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  store atomic i64 %data, ptr addrspace(1) %gep1 seq_cst, align 8
   ret void
 }
 
@@ -1176,7 +1124,7 @@ define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr_offset_neg128(i8
 ; D16 HI store (hi 16)
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %data) {
+define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %data) {
 ; GCN-LABEL: global_store_saddr_i16_d16hi_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_short_d16_hi v0, v1, s[2:3]
@@ -1188,14 +1136,13 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr(i8 addrspace(1)* i
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %data.hi = extractelement <2 x i16> %data, i32 1
-  store i16 %data.hi, i16 addrspace(1)* %gep0.cast
+  store i16 %data.hi, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %data) {
+define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %data) {
 ; GCN-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_short_d16_hi v0, v1, s[2:3] offset:-128
@@ -1207,15 +1154,14 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(i8 a
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
   %data.hi = extractelement <2 x i16> %data, i32 1
-  store i16 %data.hi, i16 addrspace(1)* %gep1.cast
+  store i16 %data.hi, ptr addrspace(1) %gep1
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %data) {
+define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %data) {
 ; GCN-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_byte_d16_hi v0, v1, s[2:3]
@@ -1227,14 +1173,14 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr(i8 addrspa
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %data.hi = extractelement <2 x i16> %data, i32 1
   %data.hi.trunc = trunc i16 %data.hi to i8
-  store i8 %data.hi.trunc, i8 addrspace(1)* %gep0
+  store i8 %data.hi.trunc, ptr addrspace(1) %gep0
   ret void
 }
 
-define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %data) {
+define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %data) {
 ; GCN-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_store_byte_d16_hi v0, v1, s[2:3] offset:-128
@@ -1246,10 +1192,10 @@ define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
   %data.hi = extractelement <2 x i16> %data, i32 1
   %data.hi.trunc = trunc i16 %data.hi to i8
-  store i8 %data.hi.trunc, i8 addrspace(1)* %gep1
+  store i8 %data.hi.trunc, ptr addrspace(1) %gep1
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
index ad14ac2280356..b8cfcbf2d2665 100644
--- a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
@@ -17,10 +17,10 @@
 ; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], private@rel32@lo+8
 ; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], private@rel32@hi+16
 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]]
-define amdgpu_kernel void @private_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @private, i32 0, i32 1
-  %val = load i32, i32 addrspace(1)* %ptr
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @private_test(ptr addrspace(1) %out) {
+  %ptr = getelementptr [256 x i32], ptr addrspace(1) @private, i32 0, i32 1
+  %val = load i32, ptr addrspace(1) %ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -29,10 +29,10 @@ define amdgpu_kernel void @private_test(i32 addrspace(1)* %out) {
 ; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], internal@rel32@lo+8
 ; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], internal@rel32@hi+16
 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]]
-define amdgpu_kernel void @internal_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @internal, i32 0, i32 1
-  %val = load i32, i32 addrspace(1)* %ptr
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @internal_test(ptr addrspace(1) %out) {
+  %ptr = getelementptr [256 x i32], ptr addrspace(1) @internal, i32 0, i32 1
+  %val = load i32, ptr addrspace(1) %ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -42,10 +42,10 @@ define amdgpu_kernel void @internal_test(i32 addrspace(1)* %out) {
 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], available_externally@gotpcrel32@hi+12
 ; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0
 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
-define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1
-  %val = load i32, i32 addrspace(1)* %ptr
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @available_externally_test(ptr addrspace(1) %out) {
+  %ptr = getelementptr [256 x i32], ptr addrspace(1) @available_externally, i32 0, i32 1
+  %val = load i32, ptr addrspace(1) %ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -55,10 +55,10 @@ define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], linkonce@gotpcrel32@hi+12
 ; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0
 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
-define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce, i32 0, i32 1
-  %val = load i32, i32 addrspace(1)* %ptr
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @linkonce_test(ptr addrspace(1) %out) {
+  %ptr = getelementptr [256 x i32], ptr addrspace(1) @linkonce, i32 0, i32 1
+  %val = load i32, ptr addrspace(1) %ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -68,10 +68,10 @@ define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) {
 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], weak@gotpcrel32@hi+12
 ; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0
 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
-define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1
-  %val = load i32, i32 addrspace(1)* %ptr
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @weak_test(ptr addrspace(1) %out) {
+  %ptr = getelementptr [256 x i32], ptr addrspace(1) @weak, i32 0, i32 1
+  %val = load i32, ptr addrspace(1) %ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -81,10 +81,10 @@ define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) {
 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], common@gotpcrel32@hi+12
 ; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0
 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
-define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1
-  %val = load i32, i32 addrspace(1)* %ptr
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @common_test(ptr addrspace(1) %out) {
+  %ptr = getelementptr [256 x i32], ptr addrspace(1) @common, i32 0, i32 1
+  %val = load i32, ptr addrspace(1) %ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -94,10 +94,10 @@ define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) {
 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], extern_weak@gotpcrel32@hi+12
 ; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0
 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
-define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1
-  %val = load i32, i32 addrspace(1)* %ptr
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @extern_weak_test(ptr addrspace(1) %out) {
+  %ptr = getelementptr [256 x i32], ptr addrspace(1) @extern_weak, i32 0, i32 1
+  %val = load i32, ptr addrspace(1) %ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -107,10 +107,10 @@ define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) {
 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], linkonce_odr@gotpcrel32@hi+12
 ; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0
 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
-define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce_odr, i32 0, i32 1
-  %val = load i32, i32 addrspace(1)* %ptr
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @linkonce_odr_test(ptr addrspace(1) %out) {
+  %ptr = getelementptr [256 x i32], ptr addrspace(1) @linkonce_odr, i32 0, i32 1
+  %val = load i32, ptr addrspace(1) %ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -120,10 +120,10 @@ define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) {
 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], weak_odr@gotpcrel32@hi+12
 ; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0
 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
-define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1
-  %val = load i32, i32 addrspace(1)* %ptr
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @weak_odr_test(ptr addrspace(1) %out) {
+  %ptr = getelementptr [256 x i32], ptr addrspace(1) @weak_odr, i32 0, i32 1
+  %val = load i32, ptr addrspace(1) %ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -133,10 +133,10 @@ define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) {
 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], external@gotpcrel32@hi+12
 ; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0
 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
-define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1
-  %val = load i32, i32 addrspace(1)* %ptr
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @external_test(ptr addrspace(1) %out) {
+  %ptr = getelementptr [256 x i32], ptr addrspace(1) @external, i32 0, i32 1
+  %val = load i32, ptr addrspace(1) %ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -146,10 +146,10 @@ define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) {
 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], external_w_init@gotpcrel32@hi+12
 ; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0
 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
-define amdgpu_kernel void @external_w_init_test(i32 addrspace(1)* %out) {
-  %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1
-  %val = load i32, i32 addrspace(1)* %ptr
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @external_w_init_test(ptr addrspace(1) %out) {
+  %ptr = getelementptr [256 x i32], ptr addrspace(1) @external_w_init, i32 0, i32 1
+  %val = load i32, ptr addrspace(1) %ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index c388bfe24d576..7c6058b7e382a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 
-define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_add_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -45,12 +45,12 @@ define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_max_neg_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_add_i32_max_neg_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -96,12 +96,12 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 -1024
-  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_add_i32_soffset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -145,12 +145,12 @@ define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %i
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
-  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_add_i32_huge_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -198,13 +198,13 @@ define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i3
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
 
-  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_add_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -257,13 +257,13 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_add_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -319,13 +319,13 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_add_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -392,14 +392,14 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_add_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -441,11 +441,11 @@ define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_add_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -498,12 +498,12 @@ define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrsp
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_add_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -557,12 +557,12 @@ define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_add_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -627,13 +627,13 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_and_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -675,12 +675,12 @@ define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_and_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -733,13 +733,13 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_and_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -795,13 +795,13 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_and_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -868,14 +868,14 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_and_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -917,11 +917,11 @@ define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_and_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -974,12 +974,12 @@ define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrsp
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_and_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1033,12 +1033,12 @@ define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_and_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1103,13 +1103,13 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_sub_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -1151,12 +1151,12 @@ define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_sub_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1209,13 +1209,13 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_sub_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1271,13 +1271,13 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_sub_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1344,14 +1344,14 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_sub_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -1393,11 +1393,11 @@ define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_sub_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1450,12 +1450,12 @@ define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrsp
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_sub_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1509,12 +1509,12 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_sub_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1579,13 +1579,13 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_max_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1670,12 +1670,12 @@ define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_max_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1725,13 +1725,13 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_max_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1781,13 +1781,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    global_atomic_smax v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_max_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1851,14 +1851,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_max_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -1894,11 +1894,11 @@ define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:    global_atomic_smax v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_max_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1948,12 +1948,12 @@ define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrsp
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_max_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -2001,12 +2001,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:    global_atomic_smax v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_max_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2068,13 +2068,13 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_umax_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -2110,12 +2110,12 @@ define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %i
 ; GFX9-NEXT:    global_atomic_umax v0, v1, s[2:3] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_umax_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2165,13 +2165,13 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i3
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umax_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -2221,13 +2221,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    global_atomic_umax v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2291,14 +2291,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_umax_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -2334,11 +2334,11 @@ define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:    global_atomic_umax v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_umax_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2388,12 +2388,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrs
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umax_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -2441,12 +2441,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %i
 ; GFX9-NEXT:    global_atomic_umax v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umax_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2508,13 +2508,13 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i3
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_min_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -2550,12 +2550,12 @@ define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:    global_atomic_smin v0, v1, s[2:3] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_min_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2605,13 +2605,13 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_min_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -2661,13 +2661,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    global_atomic_smin v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_min_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2731,14 +2731,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_min_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -2774,11 +2774,11 @@ define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:    global_atomic_smin v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_min_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2828,12 +2828,12 @@ define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrsp
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_min_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -2881,12 +2881,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:    global_atomic_smin v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_min_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2948,13 +2948,13 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_umin_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -2990,12 +2990,12 @@ define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %i
 ; GFX9-NEXT:    global_atomic_umin v0, v1, s[2:3] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_umin_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3045,13 +3045,13 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i3
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umin_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -3101,13 +3101,13 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    global_atomic_umin v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3171,14 +3171,14 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_umin_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -3214,11 +3214,11 @@ define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:    global_atomic_umin v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
+  %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_umin_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3268,12 +3268,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrs
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umin_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -3321,12 +3321,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %i
 ; GFX9-NEXT:    global_atomic_umin v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umin_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3388,13 +3388,13 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i3
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in syncscope("workgroup") seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_or_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -3436,12 +3436,12 @@ define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_or_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3494,13 +3494,13 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_or_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -3556,13 +3556,13 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_or_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3629,14 +3629,14 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %ou
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_or_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -3678,11 +3678,11 @@ define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_or_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3735,12 +3735,12 @@ define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspa
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_or_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -3794,12 +3794,12 @@ define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_or_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3864,13 +3864,13 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_xchg_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -3912,12 +3912,12 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %i
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_f32_offset(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %in) {
 ; SI-LABEL: atomic_xchg_f32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -3959,12 +3959,12 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(float addrspace(1)* %out, floa
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr float, float addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile xchg float addrspace(1)* %gep, float %in seq_cst
+  %gep = getelementptr float, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, float %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_xchg_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4017,13 +4017,13 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i3
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xchg_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -4079,13 +4079,13 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xchg_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4152,14 +4152,14 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_xchg_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -4201,11 +4201,11 @@ define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_xchg_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4258,12 +4258,12 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrs
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xchg_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -4317,12 +4317,12 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %i
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile xchg ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xchg_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4387,13 +4387,13 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i3
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile xchg ptr addrspace(1) %ptr, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 %in, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4439,12 +4439,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4500,14 +4500,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    global_store_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
   %extract0 = extractvalue { i32, i1 } %val, 0
-  store i32 %extract0, i32 addrspace(1)* %out2
+  store i32 %extract0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s6, s[0:1], 0xb
@@ -4569,13 +4569,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %o
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4648,15 +4648,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
   %extract0 = extractvalue { i32, i1 } %val, 0
-  store i32 %extract0, i32 addrspace(1)* %out2
+  store i32 %extract0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4702,11 +4702,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %out, i32 %old, i32 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4762,13 +4762,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 ad
 ; GFX9-NEXT:    global_store_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %out, i32 %old, i32 %in seq_cst seq_cst
   %extract0 = extractvalue { i32, i1 } %val, 0
-  store i32 %extract0, i32 addrspace(1)* %out2
+  store i32 %extract0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s6, s[0:1], 0xb
@@ -4828,12 +4828,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = cmpxchg volatile ptr addrspace(1) %ptr, i32 %old, i32 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) {
 ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4904,14 +4904,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    global_store_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = cmpxchg volatile ptr addrspace(1) %ptr, i32 %old, i32 %in seq_cst seq_cst
   %extract0 = extractvalue { i32, i1 } %val, 0
-  store i32 %extract0, i32 addrspace(1)* %out2
+  store i32 %extract0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_xor_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -4953,12 +4953,12 @@ define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_xor_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5011,13 +5011,13 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xor_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -5073,13 +5073,13 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xor_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5146,14 +5146,14 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_xor_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -5195,11 +5195,11 @@ define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_xor_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5252,12 +5252,12 @@ define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrsp
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xor_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -5311,12 +5311,12 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_xor_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5381,13 +5381,13 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_load_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5435,13 +5435,13 @@ define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 add
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %in, i64 4
-  %val = load atomic i32, i32 addrspace(1)* %gep  seq_cst, align 4
-  store i32 %val, i32 addrspace(1)* %out
+  %gep = getelementptr i32, ptr addrspace(1) %in, i64 4
+  %val = load atomic i32, ptr addrspace(1) %gep  seq_cst, align 4
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i32_negoffset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_load_i32_negoffset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5491,13 +5491,13 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(i32 addrspace(1)* %in, i32
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %in, i64 -128
-  %val = load atomic i32, i32 addrspace(1)* %gep  seq_cst, align 4
-  store i32 %val, i32 addrspace(1)* %out
+  %gep = getelementptr i32, ptr addrspace(1) %in, i64 -128
+  %val = load atomic i32, ptr addrspace(1) %gep  seq_cst, align 4
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_f32_offset(float addrspace(1)* %in, float addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_load_f32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5545,13 +5545,13 @@ define amdgpu_kernel void @atomic_load_f32_offset(float addrspace(1)* %in, float
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr float, float addrspace(1)* %in, i64 4
-  %val = load atomic float, float addrspace(1)* %gep  seq_cst, align 4
-  store float %val, float addrspace(1)* %out
+  %gep = getelementptr float, ptr addrspace(1) %in, i64 4
+  %val = load atomic float, ptr addrspace(1) %gep  seq_cst, align 4
+  store float %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_load_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5597,12 +5597,12 @@ define amdgpu_kernel void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
-  store i32 %val, i32 addrspace(1)* %out
+  %val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) {
 ; SI-LABEL: atomic_load_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5663,14 +5663,14 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in,
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4
-  store i32 %val, i32 addrspace(1)* %out
+  %ptr = getelementptr i32, ptr addrspace(1) %in, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = load atomic i32, ptr addrspace(1) %gep seq_cst, align 4
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) {
 ; SI-LABEL: atomic_load_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5729,13 +5729,13 @@ define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 add
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index
-  %val = load atomic i32, i32 addrspace(1)* %ptr seq_cst, align 4
-  store i32 %val, i32 addrspace(1)* %out
+  %ptr = getelementptr i32, ptr addrspace(1) %in, i64 %index
+  %val = load atomic i32, ptr addrspace(1) %ptr seq_cst, align 4
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_f32_addr64_offset(float addrspace(1)* %in, float addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) {
 ; SI-LABEL: atomic_load_f32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5796,14 +5796,14 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(float addrspace(1)* %in
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr float, float addrspace(1)* %in, i64 %index
-  %gep = getelementptr float, float addrspace(1)* %ptr, i64 4
-  %val = load atomic float, float addrspace(1)* %gep seq_cst, align 4
-  store float %val, float addrspace(1)* %out
+  %ptr = getelementptr float, ptr addrspace(1) %in, i64 %index
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4
+  %val = load atomic float, ptr addrspace(1) %gep seq_cst, align 4
+  store float %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_store_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
@@ -5841,12 +5841,12 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  store atomic i32 %in, i32 addrspace(1)* %gep  seq_cst, align 4
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  store atomic i32 %in, ptr addrspace(1) %gep  seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_store_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
@@ -5882,11 +5882,11 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
+  store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f32(float %in, float addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_store_f32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
@@ -5922,11 +5922,11 @@ define amdgpu_kernel void @atomic_store_f32(float %in, float addrspace(1)* %out)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  store atomic float %in, float addrspace(1)* %out seq_cst, align 4
+  store atomic float %in, ptr addrspace(1) %out seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace(1) %out, i64 %index) {
 ; SI-LABEL: atomic_store_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
@@ -5973,13 +5973,13 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  store atomic i32 %in, ptr addrspace(1) %gep seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, float addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspace(1) %out, i64 %index) {
 ; SI-LABEL: atomic_store_f32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
@@ -6026,13 +6026,13 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, float addrs
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr float, float addrspace(1)* %out, i64 %index
-  %gep = getelementptr float, float addrspace(1)* %ptr, i64 4
-  store atomic float %in, float addrspace(1)* %gep seq_cst, align 4
+  %ptr = getelementptr float, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4
+  store atomic float %in, ptr addrspace(1) %gep seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %out, i64 %index) {
 ; SI-LABEL: atomic_store_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
@@ -6078,12 +6078,12 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  store atomic i32 %in, ptr addrspace(1) %ptr seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f32_addr64(float %in, float addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) %out, i64 %index) {
 ; SI-LABEL: atomic_store_f32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
@@ -6129,12 +6129,12 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, float addrspace(1)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr float, float addrspace(1)* %out, i64 %index
-  store atomic float %in, float addrspace(1)* %ptr seq_cst, align 4
+  %ptr = getelementptr float, ptr addrspace(1) %out, i64 %index
+  store atomic float %in, ptr addrspace(1) %ptr seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i8_offset(i8 addrspace(1)* %in, i8 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_load_i8_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6180,13 +6180,13 @@ define amdgpu_kernel void @atomic_load_i8_offset(i8 addrspace(1)* %in, i8 addrsp
 ; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i8, i8 addrspace(1)* %in, i64 16
-  %val = load atomic i8, i8 addrspace(1)* %gep  seq_cst, align 1
-  store i8 %val, i8 addrspace(1)* %out
+  %gep = getelementptr i8, ptr addrspace(1) %in, i64 16
+  %val = load atomic i8, ptr addrspace(1) %gep  seq_cst, align 1
+  store i8 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i8_negoffset(i8 addrspace(1)* %in, i8 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_load_i8_negoffset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6236,13 +6236,13 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(i8 addrspace(1)* %in, i8 add
 ; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i8, i8 addrspace(1)* %in, i64 -512
-  %val = load atomic i8, i8 addrspace(1)* %gep  seq_cst, align 1
-  store i8 %val, i8 addrspace(1)* %out
+  %gep = getelementptr i8, ptr addrspace(1) %in, i64 -512
+  %val = load atomic i8, ptr addrspace(1) %gep  seq_cst, align 1
+  store i8 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, i8 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_store_i8_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
@@ -6280,12 +6280,12 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, i8 addrspace(1)* %out)
 ; GFX9-NEXT:    global_store_byte v0, v1, s[2:3] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i8, i8 addrspace(1)* %out, i64 16
-  store atomic i8 %in, i8 addrspace(1)* %gep  seq_cst, align 1
+  %gep = getelementptr i8, ptr addrspace(1) %out, i64 16
+  store atomic i8 %in, ptr addrspace(1) %gep  seq_cst, align 1
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i8(i8 %in, i8 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_store_i8:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
@@ -6321,11 +6321,11 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, i8 addrspace(1)* %out) {
 ; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  store atomic i8 %in, i8 addrspace(1)* %out seq_cst, align 1
+  store atomic i8 %in, ptr addrspace(1) %out seq_cst, align 1
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i16_offset(i16 addrspace(1)* %in, i16 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_load_i16_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6371,13 +6371,13 @@ define amdgpu_kernel void @atomic_load_i16_offset(i16 addrspace(1)* %in, i16 add
 ; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i16, i16 addrspace(1)* %in, i64 8
-  %val = load atomic i16, i16 addrspace(1)* %gep  seq_cst, align 2
-  store i16 %val, i16 addrspace(1)* %out
+  %gep = getelementptr i16, ptr addrspace(1) %in, i64 8
+  %val = load atomic i16, ptr addrspace(1) %gep  seq_cst, align 2
+  store i16 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i16_negoffset(i16 addrspace(1)* %in, i16 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_load_i16_negoffset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6427,13 +6427,13 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(i16 addrspace(1)* %in, i16
 ; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i16, i16 addrspace(1)* %in, i64 -256
-  %val = load atomic i16, i16 addrspace(1)* %gep  seq_cst, align 2
-  store i16 %val, i16 addrspace(1)* %out
+  %gep = getelementptr i16, ptr addrspace(1) %in, i64 -256
+  %val = load atomic i16, ptr addrspace(1) %gep  seq_cst, align 2
+  store i16 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, i16 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_store_i16_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
@@ -6471,12 +6471,12 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, i16 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_short v0, v1, s[2:3] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i16, i16 addrspace(1)* %out, i64 8
-  store atomic i16 %in, i16 addrspace(1)* %gep  seq_cst, align 2
+  %gep = getelementptr i16, ptr addrspace(1) %out, i64 8
+  store atomic i16 %in, ptr addrspace(1) %gep  seq_cst, align 2
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i16(i16 %in, i16 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_store_i16:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
@@ -6512,11 +6512,11 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, i16 addrspace(1)* %out) {
 ; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  store atomic i16 %in, i16 addrspace(1)* %out seq_cst, align 2
+  store atomic i16 %in, ptr addrspace(1) %out seq_cst, align 2
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f16_offset(half %in, half addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_store_f16_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
@@ -6554,12 +6554,12 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, half addrspace(1)*
 ; GFX9-NEXT:    global_store_short v0, v1, s[2:3] offset:16
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr half, half addrspace(1)* %out, i64 8
-  store atomic half %in, half addrspace(1)* %gep  seq_cst, align 2
+  %gep = getelementptr half, ptr addrspace(1) %out, i64 8
+  store atomic half %in, ptr addrspace(1) %gep  seq_cst, align 2
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f16(half %in, half addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) {
 ; SI-LABEL: atomic_store_f16:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
@@ -6595,6 +6595,6 @@ define amdgpu_kernel void @atomic_store_f16(half %in, half addrspace(1)* %out) {
 ; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  store atomic half %in, half addrspace(1)* %out seq_cst, align 2
+  store atomic half %in, ptr addrspace(1) %out seq_cst, align 2
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 3d019d21a2cbc..48aa210665e79 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 
-define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_add_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -45,12 +45,12 @@ define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_add_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -106,13 +106,13 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_add_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -168,13 +168,13 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_add_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -238,14 +238,14 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_add_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -291,11 +291,11 @@ define amdgpu_kernel void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_add_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -351,12 +351,12 @@ define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrsp
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_add_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -410,12 +410,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_add_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -477,13 +477,13 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_and_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -525,12 +525,12 @@ define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_and_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -586,13 +586,13 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_and_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -648,13 +648,13 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_and_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -718,14 +718,14 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_and_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -771,11 +771,11 @@ define amdgpu_kernel void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_and_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -831,12 +831,12 @@ define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrsp
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_and_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -890,12 +890,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_and_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -957,13 +957,13 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_sub_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1005,12 +1005,12 @@ define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_sub_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1066,13 +1066,13 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_sub_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1128,13 +1128,13 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_sub_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1198,14 +1198,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_sub_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1251,11 +1251,11 @@ define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_sub_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1311,12 +1311,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrsp
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_sub_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1370,12 +1370,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_sub_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1437,13 +1437,13 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_max_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1479,12 +1479,12 @@ define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_max_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1537,13 +1537,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_max_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1593,13 +1593,13 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_max_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1660,14 +1660,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_max_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1707,11 +1707,11 @@ define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in syncscope("workgroup") seq_cst
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_max_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1764,12 +1764,12 @@ define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrsp
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_max_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1817,12 +1817,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:    global_atomic_smax_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_max_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1881,13 +1881,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_umax_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1923,12 +1923,12 @@ define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %i
 ; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_umax_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1981,13 +1981,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i6
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umax_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2037,13 +2037,13 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2104,14 +2104,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_umax_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2151,11 +2151,11 @@ define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in syncscope("workgroup") seq_cst
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_umax_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2208,12 +2208,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrs
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umax_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2261,12 +2261,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %i
 ; GFX9-NEXT:    global_atomic_umax_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umax_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2325,13 +2325,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i6
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_min_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2367,12 +2367,12 @@ define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_min_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2425,13 +2425,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_min_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2481,13 +2481,13 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_min_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2548,14 +2548,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_min_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2595,11 +2595,11 @@ define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in syncscope("workgroup") seq_cst
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_min_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2652,12 +2652,12 @@ define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrsp
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_min_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2705,12 +2705,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:    global_atomic_smin_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_min_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2769,13 +2769,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_umin_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2811,12 +2811,12 @@ define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %i
 ; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_umin_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2869,13 +2869,13 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i6
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umin_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2925,13 +2925,13 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umin_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2992,14 +2992,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_umin_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3039,11 +3039,11 @@ define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in syncscope("workgroup") seq_cst
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_umin_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3096,12 +3096,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrs
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umin_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3149,12 +3149,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %i
 ; GFX9-NEXT:    global_atomic_umin_x2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in syncscope("workgroup") seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umin_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -3213,13 +3213,13 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i6
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in syncscope("workgroup") seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_or_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3261,12 +3261,12 @@ define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_or_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3322,13 +3322,13 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_or_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3384,13 +3384,13 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_or_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -3454,14 +3454,14 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %ou
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_or_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3507,11 +3507,11 @@ define amdgpu_kernel void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_or_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3567,12 +3567,12 @@ define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspa
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_or_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3626,12 +3626,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_or_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -3693,13 +3693,13 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_xchg_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3741,12 +3741,12 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %i
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_f64_offset(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double %in) {
 ; CI-LABEL: atomic_xchg_f64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3788,12 +3788,12 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(double addrspace(1)* %out, dou
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr double, double addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile xchg double addrspace(1)* %gep, double %in seq_cst
+  %gep = getelementptr double, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, double %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_pointer_offset(i8* addrspace(1)* %out, i8* %in) {
+define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr %in) {
 ; CI-LABEL: atomic_xchg_pointer_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3835,12 +3835,12 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(i8* addrspace(1)* %out, i8
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i8*, i8* addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile xchg i8* addrspace(1)* %gep, i8* %in seq_cst
+  %gep = getelementptr ptr, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, ptr %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_xchg_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3896,13 +3896,13 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i6
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_xchg_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3958,13 +3958,13 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_xchg_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -4028,14 +4028,14 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_xchg_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4081,11 +4081,11 @@ define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_xchg_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4141,12 +4141,12 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrs
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_xchg_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4200,12 +4200,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %i
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_xchg_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -4267,13 +4267,13 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i6
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_xor_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_xor_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4376,13 +4376,13 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_xor_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4438,13 +4438,13 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_xor_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -4508,14 +4508,14 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_xor_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4561,11 +4561,11 @@ define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_xor_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4621,12 +4621,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrsp
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_xor_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4680,12 +4680,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_xor_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -4747,13 +4747,13 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 %in, i64 %old) {
 ; CI-LABEL: atomic_cmpxchg_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4808,12 +4808,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 %in, i64 %old) {
 ; CI-LABEL: atomic_cmpxchg_i64_soffset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -4870,12 +4870,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i6
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 9000
-  %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 9000
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %old) {
 ; CI-LABEL: atomic_cmpxchg_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -4934,14 +4934,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
-  store i64 %extract0, i64 addrspace(1)* %out2
+  store i64 %extract0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index, i64 %old) {
 ; CI-LABEL: atomic_cmpxchg_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -5001,13 +5001,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %o
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) {
 ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
@@ -5098,15 +5098,15 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
-  store i64 %extract0, i64 addrspace(1)* %out2
+  store i64 %extract0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i64 %old) {
 ; CI-LABEL: atomic_cmpxchg_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5161,11 +5161,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %old) {
 ; CI-LABEL: atomic_cmpxchg_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -5224,13 +5224,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 ad
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
-  store i64 %extract0, i64 addrspace(1)* %out2
+  store i64 %extract0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index, i64 %old) {
 ; CI-LABEL: atomic_cmpxchg_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -5288,12 +5288,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %val = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in seq_cst seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) {
 ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
@@ -5382,14 +5382,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out,
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %val = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
-  store i64 %extract0, i64 addrspace(1)* %out2
+  store i64 %extract0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CI-LABEL: atomic_load_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5437,13 +5437,13 @@ define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 add
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %in, i64 4
-  %val = load atomic i64, i64 addrspace(1)* %gep  seq_cst, align 8
-  store i64 %val, i64 addrspace(1)* %out
+  %gep = getelementptr i64, ptr addrspace(1) %in, i64 4
+  %val = load atomic i64, ptr addrspace(1) %gep  seq_cst, align 8
+  store i64 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i64_neg_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CI-LABEL: atomic_load_i64_neg_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5493,13 +5493,13 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(i64 addrspace(1)* %in, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %in, i64 -4
-  %val = load atomic i64, i64 addrspace(1)* %gep  seq_cst, align 8
-  store i64 %val, i64 addrspace(1)* %out
+  %gep = getelementptr i64, ptr addrspace(1) %in, i64 -4
+  %val = load atomic i64, ptr addrspace(1) %gep  seq_cst, align 8
+  store i64 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CI-LABEL: atomic_load_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5545,12 +5545,12 @@ define amdgpu_kernel void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = load atomic i64, i64 addrspace(1)* %in seq_cst, align 8
-  store i64 %val, i64 addrspace(1)* %out
+  %val = load atomic i64, ptr addrspace(1) %in seq_cst, align 8
+  store i64 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) {
 ; CI-LABEL: atomic_load_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5611,14 +5611,14 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in,
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %val = load atomic i64, i64 addrspace(1)* %gep seq_cst, align 8
-  store i64 %val, i64 addrspace(1)* %out
+  %ptr = getelementptr i64, ptr addrspace(1) %in, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %val = load atomic i64, ptr addrspace(1) %gep seq_cst, align 8
+  store i64 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) {
 ; CI-LABEL: atomic_load_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5677,13 +5677,13 @@ define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 add
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index
-  %val = load atomic i64, i64 addrspace(1)* %ptr seq_cst, align 8
-  store i64 %val, i64 addrspace(1)* %out
+  %ptr = getelementptr i64, ptr addrspace(1) %in, i64 %index
+  %val = load atomic i64, ptr addrspace(1) %ptr seq_cst, align 8
+  store i64 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_load_f64_addr64_offset(double addrspace(1)* %in, double addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) {
 ; CI-LABEL: atomic_load_f64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5744,14 +5744,14 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(double addrspace(1)* %i
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr double, double addrspace(1)* %in, i64 %index
-  %gep = getelementptr double, double addrspace(1)* %ptr, i64 4
-  %val = load atomic double, double addrspace(1)* %gep seq_cst, align 8
-  store double %val, double addrspace(1)* %out
+  %ptr = getelementptr double, ptr addrspace(1) %in, i64 %index
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 4
+  %val = load atomic double, ptr addrspace(1) %gep seq_cst, align 8
+  store double %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %out) {
 ; CI-LABEL: atomic_store_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5791,12 +5791,12 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] offset:32
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  store atomic i64 %in, i64 addrspace(1)* %gep  seq_cst, align 8
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  store atomic i64 %in, ptr addrspace(1) %gep  seq_cst, align 8
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) {
 ; CI-LABEL: atomic_store_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5834,11 +5834,11 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) {
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8
+  store atomic i64 %in, ptr addrspace(1) %out seq_cst, align 8
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace(1) %out, i64 %index) {
 ; CI-LABEL: atomic_store_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5889,13 +5889,13 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  store atomic i64 %in, i64 addrspace(1)* %gep seq_cst, align 8
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  store atomic i64 %in, ptr addrspace(1) %gep seq_cst, align 8
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %out, i64 %index) {
 ; CI-LABEL: atomic_store_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -5944,12 +5944,12 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  store atomic i64 %in, i64 addrspace(1)* %ptr seq_cst, align 8
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  store atomic i64 %in, ptr addrspace(1) %ptr seq_cst, align 8
   ret void
 }
 
-define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, double addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrspace(1) %out, i64 %index) {
 ; CI-LABEL: atomic_store_f64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -6000,8 +6000,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, double add
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] offset:32
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr double, double addrspace(1)* %out, i64 %index
-  %gep = getelementptr double, double addrspace(1)* %ptr, i64 4
-  store atomic double %in, double addrspace(1)* %gep seq_cst, align 8
+  %ptr = getelementptr double, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 4
+  store atomic double %in, ptr addrspace(1) %gep seq_cst, align 8
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_min_max_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_min_max_system.ll
index 7187f5d77d9a2..b0ea575610042 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_min_max_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_min_max_system.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 
-define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_max_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -108,12 +108,12 @@ define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_max_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -235,13 +235,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_max_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -356,13 +356,13 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_max_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -489,14 +489,14 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_max_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -601,11 +601,11 @@ define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_max_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -726,12 +726,12 @@ define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrsp
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_max_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -844,12 +844,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_max_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -974,13 +974,13 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_umax_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1085,12 +1085,12 @@ define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %i
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_umax_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1212,13 +1212,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i6
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umax_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1333,13 +1333,13 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umax_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1466,14 +1466,14 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_umax_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1578,11 +1578,11 @@ define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_umax_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1703,12 +1703,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrs
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umax_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1821,12 +1821,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %i
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umax_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -1951,13 +1951,13 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i6
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_min_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2062,12 +2062,12 @@ define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_min_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2189,13 +2189,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_min_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -2310,13 +2310,13 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_min_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2443,14 +2443,14 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_min_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2555,11 +2555,11 @@ define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_min_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2680,12 +2680,12 @@ define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrsp
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_min_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -2798,12 +2798,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_min_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -2928,13 +2928,13 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_umin_i64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3039,12 +3039,12 @@ define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %i
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_umin_i64_ret_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3166,13 +3166,13 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i6
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umin_i64_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -3287,13 +3287,13 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out,
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umin_i64_ret_addr64_offset:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -3420,14 +3420,14 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
 ; CI-LABEL: atomic_umin_i64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -3532,11 +3532,11 @@ define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
 ; CI-LABEL: atomic_umin_i64_ret:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3657,12 +3657,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrs
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umin_i64_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -3775,12 +3775,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %i
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
 ; CI-LABEL: atomic_umin_i64_ret_addr64:
 ; CI:       ; %bb.0: ; %entry
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -3905,8 +3905,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i6
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
-  %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst
-  store i64 %tmp0, i64 addrspace(1)* %out2
+  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
+  %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in seq_cst
+  store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_min_max_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_min_max_system.ll
index e61ac9a9cada2..1fd4c79bda46e 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_min_max_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_min_max_system.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 
-define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_max_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -88,12 +88,12 @@ define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_max_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -194,13 +194,13 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_max_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -297,13 +297,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_max_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -415,14 +415,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_max_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -505,11 +505,11 @@ define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
+  %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_max_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -609,12 +609,12 @@ define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrsp
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_max_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -709,12 +709,12 @@ define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_max_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -824,13 +824,13 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_umax_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -915,12 +915,12 @@ define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %i
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_umax_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1021,13 +1021,13 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i3
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umax_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -1124,13 +1124,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umax_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -1242,14 +1242,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_umax_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1332,11 +1332,11 @@ define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
+  %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_umax_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1436,12 +1436,12 @@ define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrs
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umax_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -1536,12 +1536,12 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %i
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umax_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -1651,13 +1651,13 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i3
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_min_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -1742,12 +1742,12 @@ define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_min_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1848,13 +1848,13 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_min_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -1951,13 +1951,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_min_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -2069,14 +2069,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %o
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_min_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -2159,11 +2159,11 @@ define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
+  %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_min_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2263,12 +2263,12 @@ define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrsp
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_min_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -2363,12 +2363,12 @@ define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_min_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -2478,13 +2478,13 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_umin_i32_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -2569,12 +2569,12 @@ define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %i
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_umin_i32_ret_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2675,13 +2675,13 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i3
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
-  %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umin_i32_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -2778,13 +2778,13 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out,
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umin_i32_ret_addr64_offset:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -2896,14 +2896,14 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
-  %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+  %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_umin_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -2986,11 +2986,11 @@ define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
+  %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
 ; SI-LABEL: atomic_umin_i32_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3090,12 +3090,12 @@ define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrs
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umin_i32_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -3190,12 +3190,12 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %i
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
 }
 
-define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
 ; SI-LABEL: atomic_umin_i32_ret_addr64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xf
@@ -3305,8 +3305,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i3
 ; GFX9-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
-  %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
-  store i32 %val, i32 addrspace(1)* %out2
+  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+  %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in seq_cst
+  store i32 %val, ptr addrspace(1) %out2
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll
index 82c682c7294af..3b71e8ffefbf8 100644
--- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll
@@ -5,21 +5,21 @@
 ; CHECK: s_load_dwordx4
 ; CHECK-NOT: flat_load_dword
 
-define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) {
+define amdgpu_kernel void @uniform_load(ptr addrspace(1) %arg, [8 x i32], ptr addrspace(1) %arg1) {
 bb:
-  %tmp2 = load float, float addrspace(1)* %arg, align 4, !tbaa !8
+  %tmp2 = load float, ptr addrspace(1) %arg, align 4, !tbaa !8
   %tmp3 = fadd float %tmp2, 0.000000e+00
-  %tmp4 = getelementptr inbounds float, float addrspace(1)* %arg, i64 1
-  %tmp5 = load float, float addrspace(1)* %tmp4, align 4, !tbaa !8
+  %tmp4 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 1
+  %tmp5 = load float, ptr addrspace(1) %tmp4, align 4, !tbaa !8
   %tmp6 = fadd float %tmp3, %tmp5
-  %tmp7 = getelementptr inbounds float, float addrspace(1)* %arg, i64 2
-  %tmp8 = load float, float addrspace(1)* %tmp7, align 4, !tbaa !8
+  %tmp7 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 2
+  %tmp8 = load float, ptr addrspace(1) %tmp7, align 4, !tbaa !8
   %tmp9 = fadd float %tmp6, %tmp8
-  %tmp10 = getelementptr inbounds float, float addrspace(1)* %arg, i64 3
-  %tmp11 = load float, float addrspace(1)* %tmp10, align 4, !tbaa !8
+  %tmp10 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 3
+  %tmp11 = load float, ptr addrspace(1) %tmp10, align 4, !tbaa !8
   %tmp12 = fadd float %tmp9, %tmp11
-  %tmp13 = getelementptr inbounds float, float addrspace(1)* %arg1
-  store float %tmp12, float addrspace(1)* %tmp13, align 4, !tbaa !8
+  %tmp13 = getelementptr inbounds float, ptr addrspace(1) %arg1
+  store float %tmp12, ptr addrspace(1) %tmp13, align 4, !tbaa !8
   ret void
 }
 
@@ -31,12 +31,12 @@ bb:
 ; CHECK: flat_load_dword
 ; CHECK: flat_store_dword
 
-define amdgpu_kernel void @uniform_load_store_load(float addrspace(1)* %arg0, float addrspace(1)* %arg1) {
+define amdgpu_kernel void @uniform_load_store_load(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 bb:
-  %tmp2 = load float, float addrspace(1)* %arg0, !tbaa !8
-  store float %tmp2, float addrspace(1)* %arg1, !tbaa !8
-  %tmp3 = load float, float addrspace(1)* %arg0, !tbaa !8
-  store float %tmp3, float addrspace(1)* %arg1, !tbaa !8
+  %tmp2 = load float, ptr addrspace(1) %arg0, !tbaa !8
+  store float %tmp2, ptr addrspace(1) %arg1, !tbaa !8
+  %tmp3 = load float, ptr addrspace(1) %arg0, !tbaa !8
+  store float %tmp3, ptr addrspace(1) %arg1, !tbaa !8
   ret void
 }
 
@@ -45,26 +45,26 @@ bb:
 ; CHECK: flat_load_dword
 ; CHECK-NOT: s_load_dwordx4
 
-define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) #0 {
+define amdgpu_kernel void @non-uniform_load(ptr addrspace(1) %arg, [8 x i32], ptr addrspace(1) %arg1) #0 {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp
-  %tmp3 = load float, float addrspace(1)* %tmp2, align 4, !tbaa !8
+  %tmp2 = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %tmp
+  %tmp3 = load float, ptr addrspace(1) %tmp2, align 4, !tbaa !8
   %tmp4 = fadd float %tmp3, 0.000000e+00
   %tmp5 = add i32 %tmp, 1
-  %tmp6 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp5
-  %tmp7 = load float, float addrspace(1)* %tmp6, align 4, !tbaa !8
+  %tmp6 = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %tmp5
+  %tmp7 = load float, ptr addrspace(1) %tmp6, align 4, !tbaa !8
   %tmp8 = fadd float %tmp4, %tmp7
   %tmp9 = add i32 %tmp, 2
-  %tmp10 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp9
-  %tmp11 = load float, float addrspace(1)* %tmp10, align 4, !tbaa !8
+  %tmp10 = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %tmp9
+  %tmp11 = load float, ptr addrspace(1) %tmp10, align 4, !tbaa !8
   %tmp12 = fadd float %tmp8, %tmp11
   %tmp13 = add i32 %tmp, 3
-  %tmp14 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp13
-  %tmp15 = load float, float addrspace(1)* %tmp14, align 4, !tbaa !8
+  %tmp14 = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %tmp13
+  %tmp15 = load float, ptr addrspace(1) %tmp14, align 4, !tbaa !8
   %tmp16 = fadd float %tmp12, %tmp15
-  %tmp17 = getelementptr inbounds float, float addrspace(1)* %arg1, i32 %tmp
-  store float %tmp16, float addrspace(1)* %tmp17, align 4, !tbaa !8
+  %tmp17 = getelementptr inbounds float, ptr addrspace(1) %arg1, i32 %tmp
+  store float %tmp16, ptr addrspace(1) %tmp17, align 4, !tbaa !8
   ret void
 }
 
@@ -76,10 +76,10 @@ bb:
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
 
-define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) {
-  store i32 0, i32 addrspace(1)* %out0
-  %val = load i32, i32 addrspace(1)* %in
-  store i32 %val, i32 addrspace(1)* %out1
+define amdgpu_kernel void @no_memdep_alias_arg(ptr addrspace(1) noalias %in, [8 x i32], ptr addrspace(1) %out0, [8 x i32], ptr addrspace(1) %out1) {
+  store i32 0, ptr addrspace(1) %out0
+  %val = load i32, ptr addrspace(1) %in
+  store i32 %val, ptr addrspace(1) %out1
   ret void
 }
 
@@ -88,10 +88,10 @@ define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, [8
 ; CHECK: flat_store_dword
 ; CHECK: flat_load_dword [[VVAL:v[0-9]+]]
 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
-define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) {
-  store i32 0, i32 addrspace(1)* %out0
-  %val = load i32, i32 addrspace(1)* %in
-  store i32 %val, i32 addrspace(1)* %out1
+define amdgpu_kernel void @memdep(ptr addrspace(1) %in, [8 x i32], ptr addrspace(1) %out0, [8 x i32], ptr addrspace(1) %out1) {
+  store i32 0, ptr addrspace(1) %out0
+  %val = load i32, ptr addrspace(1) %in
+  store i32 %val, ptr addrspace(1) %out1
   ret void
 }
 
@@ -104,13 +104,13 @@ define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, [8 x i32], i32 addrspac
 ; CHECK-DAG: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
-@A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4
+@A = common local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 4
 
-define amdgpu_kernel void @global_array(i32 addrspace(1)* nocapture %out) {
+define amdgpu_kernel void @global_array(ptr addrspace(1) nocapture %out) {
 entry:
-  %load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
-  %load1 = load i32, i32 addrspace(1)* %load0, align 4
-  store i32 %load1, i32 addrspace(1)* %out, align 4
+  %load0 = load ptr addrspace(1), ptr addrspace(1) @A, align 4
+  %load1 = load i32, ptr addrspace(1) %load0, align 4
+  store i32 %load1, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -123,13 +123,13 @@ entry:
 ; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v[[[ADDR_LO]]:[[ADDR_HI]]]
 ; CHECK: flat_load_dword [[VVAL:v[0-9]+]], [[A_ADDR]]
 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
-define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, [8 x i32], i32 %n) {
+define amdgpu_kernel void @global_array_alias_store(ptr addrspace(1) nocapture %out, [8 x i32], i32 %n) {
 entry:
-  %gep = getelementptr i32, i32 addrspace(1) * %out, i32 %n
-  store i32 12, i32 addrspace(1) * %gep
-  %load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
-  %load1 = load i32, i32 addrspace(1)* %load0, align 4
-  store i32 %load1, i32 addrspace(1)* %out, align 4
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 %n
+  store i32 12, ptr addrspace(1) %gep
+  %load0 = load ptr addrspace(1), ptr addrspace(1) @A, align 4
+  %load1 = load i32, ptr addrspace(1) %load0, align 4
+  store i32 %load1, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll b/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
index fd51f47bce892..670666b782434 100644
--- a/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
@@ -25,11 +25,11 @@
 
 ; CHECK: flat_store_dword
 
-define amdgpu_kernel void @cfg(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 {
+define amdgpu_kernel void @cfg(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) #0 {
 bb:
   %tmp = sext i32 %arg2 to i64
-  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp
-  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4, !tbaa !0
+  %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp
+  %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4, !tbaa !0
   %tmp5 = icmp sgt i32 %tmp4, 0
   br i1 %tmp5, label %bb6, label %bb8
 
@@ -41,8 +41,8 @@ bb7:                                              ; preds = %bb22
 
 bb8:                                              ; preds = %bb7, %bb
   %tmp9 = phi i32 [ 0, %bb ], [ %tmp30, %bb7 ]
-  %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp
-  store i32 %tmp9, i32 addrspace(1)* %tmp10, align 4, !tbaa !0
+  %tmp10 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp
+  store i32 %tmp9, ptr addrspace(1) %tmp10, align 4, !tbaa !0
   ret void
 
 bb11:                                             ; preds = %bb22, %bb6
@@ -50,24 +50,24 @@ bb11:                                             ; preds = %bb22, %bb6
   %tmp13 = phi i32 [ %tmp25, %bb22 ], [ 0, %bb6 ]
   %tmp14 = srem i32 %tmp13, %arg2
   %tmp15 = sext i32 %tmp14 to i64
-  %tmp16 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp15
-  %tmp17 = load i32, i32 addrspace(1)* %tmp16, align 4, !tbaa !0
+  %tmp16 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp15
+  %tmp17 = load i32, ptr addrspace(1) %tmp16, align 4, !tbaa !0
   %tmp18 = icmp sgt i32 %tmp17, 100
   %tmp19 = sext i32 %tmp13 to i64
   br i1 %tmp18, label %bb20, label %bb22
 
 bb20:                                             ; preds = %bb11
-  %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp19
-  store i32 0, i32 addrspace(1)* %tmp21, align 4, !tbaa !0
+  %tmp21 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp19
+  store i32 0, ptr addrspace(1) %tmp21, align 4, !tbaa !0
   br label %bb22
 
 bb22:                                             ; preds = %bb20, %bb11
-  %tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp19
-  %tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4, !tbaa !0
+  %tmp23 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp19
+  %tmp24 = load i32, ptr addrspace(1) %tmp23, align 4, !tbaa !0
   %tmp25 = add nuw nsw i32 %tmp13, 1
   %tmp26 = sext i32 %tmp25 to i64
-  %tmp27 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp26
-  %tmp28 = load i32, i32 addrspace(1)* %tmp27, align 4, !tbaa !0
+  %tmp27 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp26
+  %tmp28 = load i32, ptr addrspace(1) %tmp27, align 4, !tbaa !0
   %tmp29 = add i32 %tmp24, %tmp12
   %tmp30 = add i32 %tmp29, %tmp28
   %tmp31 = icmp eq i32 %tmp25, %tmp4
@@ -85,7 +85,7 @@ bb22:                                             ; preds = %bb20, %bb11
 
 ; CHECK: flat_load_dword
 
-define amdgpu_kernel void @cfg_selfloop(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 {
+define amdgpu_kernel void @cfg_selfloop(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) #0 {
 bb:
   br label %bb1
 
@@ -96,11 +96,11 @@ bb1:
   %tmp13 = phi i32 [ %tmp25, %bb1 ], [ 0, %bb ]
   %tmp14 = srem i32 %tmp13, %arg2
   %tmp15 = sext i32 %tmp14 to i64
-  %tmp16 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp15
-  %tmp17 = load i32, i32 addrspace(1)* %tmp16, align 4, !tbaa !0
+  %tmp16 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp15
+  %tmp17 = load i32, ptr addrspace(1) %tmp16, align 4, !tbaa !0
   %tmp19 = sext i32 %tmp13 to i64
-  %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp19
-  store i32 %tmp17, i32 addrspace(1)* %tmp21, align 4, !tbaa !0
+  %tmp21 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp19
+  store i32 %tmp17, ptr addrspace(1) %tmp21, align 4, !tbaa !0
   %tmp25 = add nuw nsw i32 %tmp13, 1
   %tmp31 = icmp eq i32 %tmp25, 100
   br i1 %tmp31, label %bb2, label %bb1

diff  --git a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
index 9d1bd4bde0723..6c921441c972d 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
@@ -15,7 +15,7 @@
 ; reverse order. With the kills inserted to artificially extend the
 ; pointer live ranges to hint the soft clause, we get worse
 ; allocation and need the extra copies before the loads.
-define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; FORWARDXNACK-LABEL: shuffle_v4f16_234u:
 ; FORWARDXNACK:       ; %bb.0:
 ; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46,8 +46,8 @@ define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; NOXNACK-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
 ; NOXNACK-NEXT:    s_waitcnt vmcnt(0)
 ; NOXNACK-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
   ret <4 x half> %shuffle
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll b/llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll
index e641d7266a793..72e5d0a9ae711 100644
--- a/llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll
@@ -15,7 +15,7 @@
 ; CHECK: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4
 define amdgpu_kernel void @lds_no_offset() {
 entry:
-  %ptr = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 1
-  store i32 0, i32 addrspace(3)* %ptr
+  %ptr = getelementptr [4 x i32], ptr addrspace(3) @lds, i32 0, i32 1
+  store i32 0, ptr addrspace(3) %ptr
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
index 83ddad21bf17d..31ca10c2d3614 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 |   FileCheck --check-prefix=PARSER %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 |   FileCheck --check-prefix=PARSER %s
 
-@llvm.global_ctors = appending addrspace(1) global [2 x { i32, void ()*, i8*  }] [{ i32, void ()*, i8*  } { i32 1, void ()* @foo, i8* null  }, { i32, void ()*, i8*  } { i32 1, void ()* @foo.5, i8* null  }]
+@llvm.global_ctors = appending addrspace(1) global [2 x { i32, ptr, ptr  }] [{ i32, ptr, ptr  } { i32 1, ptr @foo, ptr null  }, { i32, ptr, ptr  } { i32 1, ptr @foo.5, ptr null  }]
 
 define internal void @foo() {
       ret void
@@ -21,7 +21,7 @@ define internal void @foo.5() {
 ; CHECK: .kind: init
 ; CHECK: .name: amdgcn.device.init
 
-@llvm.global_dtors = appending addrspace(1) global [2 x { i32, void ()*, i8*  }] [{ i32, void ()*, i8*  } { i32 1, void ()* @bar, i8* null  }, { i32, void ()*, i8*  } { i32 1, void ()* @bar.5, i8* null  }]
+@llvm.global_dtors = appending addrspace(1) global [2 x { i32, ptr, ptr  }] [{ i32, ptr, ptr  } { i32 1, ptr @bar, ptr null  }, { i32, ptr, ptr  } { i32 1, ptr @bar.5, ptr null  }]
 
 define internal void @bar() {
       ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
index 759f50cbbe626..cde62ebd5a16e 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
@@ -24,14 +24,14 @@
 ; CHECK:          .name:           test0
 ; CHECK:          .symbol:         test0.kd
 define amdgpu_kernel void @test0(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -57,14 +57,14 @@ entry:
 ; CHECK:          .name:           test8
 ; CHECK:          .symbol:         test8.kd
 define amdgpu_kernel void @test8(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #0 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #0 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -93,14 +93,14 @@ entry:
 ; CHECK:          .name:           test16
 ; CHECK:          .symbol:         test16.kd
 define amdgpu_kernel void @test16(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #1 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #1 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -132,14 +132,14 @@ entry:
 ; CHECK:          .name:           test24
 ; CHECK:          .symbol:         test24.kd
 define amdgpu_kernel void @test24(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #2 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #2 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -174,14 +174,14 @@ entry:
 ; CHECK:          .name:           test32
 ; CHECK:          .symbol:         test32.kd
 define amdgpu_kernel void @test32(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #3 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #3 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -222,14 +222,14 @@ entry:
 ; CHECK:          .name:           test48
 ; CHECK:          .symbol:         test48.kd
 define amdgpu_kernel void @test48(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #4 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #4 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -273,14 +273,14 @@ entry:
 ; CHECK:          .name:           test56
 ; CHECK:          .symbol:         test56.kd
 define amdgpu_kernel void @test56(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #5 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #5 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
index ebe46ffa45af3..464f2ad4993fc 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
@@ -98,14 +98,14 @@
 ; CHECK-NEXT: - 1
 ; CHECK-NEXT: - 2
 define amdgpu_kernel void @test_v5(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #0 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #0 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
index cae375da0e089..1a80f9d7518d8 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
@@ -26,14 +26,14 @@
 ; CHECK-NEXT:       AddrSpaceQual:   Global
 ; CHECK-NEXT:   CodeProps:
 define amdgpu_kernel void @test0(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -60,14 +60,14 @@ entry:
 ; CHECK-NEXT:       ValueKind:       HiddenGlobalOffsetX
 ; CHECK-NEXT:   CodeProps:
 define amdgpu_kernel void @test8(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #0 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #0 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -97,14 +97,14 @@ entry:
 ; CHECK-NEXT:       ValueKind:       HiddenGlobalOffsetY
 ; CHECK-NEXT:   CodeProps:
 define amdgpu_kernel void @test16(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #1 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #1 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -137,14 +137,14 @@ entry:
 ; CHECK-NEXT:       ValueKind:       HiddenGlobalOffsetZ
 ; CHECK-NEXT:   CodeProps:
 define amdgpu_kernel void @test24(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #2 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #2 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -181,14 +181,14 @@ entry:
 ; CHECK-NEXT:       AddrSpaceQual:   Global
 ; CHECK-NEXT:   CodeProps:
 define amdgpu_kernel void @test32(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #3 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #3 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -233,14 +233,14 @@ entry:
 ; CHECK-NEXT:       AddrSpaceQual:   Global
 ; CHECK-NEXT:   CodeProps:
 define amdgpu_kernel void @test48(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #4 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #4 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -289,14 +289,14 @@ entry:
 ; CHECK-NEXT:       AddrSpaceQual:   Global
 ; CHECK-NEXT:   CodeProps:
 define amdgpu_kernel void @test56(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #5 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #5 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
index 56477008006b4..a74d22f72a9f9 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
@@ -78,18 +78,18 @@
 ; CHECK:         .size:           8
 ; CHECK:         .type_name:      image3d_t
 ; CHECK:         .value_kind:     image
-define amdgpu_kernel void @test(%opencl.image1d_t addrspace(1)* %a,
-                                %opencl.image1d_array_t addrspace(1)* %b,
-                                %opencl.image1d_buffer_t addrspace(1)* %c,
-                                %opencl.image2d_t addrspace(1)* %d,
-                                %opencl.image2d_array_t addrspace(1)* %e,
-                                %opencl.image2d_array_depth_t addrspace(1)* %f,
-                                %opencl.image2d_array_msaa_t addrspace(1)* %g,
-                                %opencl.image2d_array_msaa_depth_t addrspace(1)* %h,
-                                %opencl.image2d_depth_t addrspace(1)* %i,
-                                %opencl.image2d_msaa_t addrspace(1)* %j,
-                                %opencl.image2d_msaa_depth_t addrspace(1)* %k,
-                                %opencl.image3d_t addrspace(1)* %l)
+define amdgpu_kernel void @test(ptr addrspace(1) %a,
+                                ptr addrspace(1) %b,
+                                ptr addrspace(1) %c,
+                                ptr addrspace(1) %d,
+                                ptr addrspace(1) %e,
+                                ptr addrspace(1) %f,
+                                ptr addrspace(1) %g,
+                                ptr addrspace(1) %h,
+                                ptr addrspace(1) %i,
+                                ptr addrspace(1) %j,
+                                ptr addrspace(1) %k,
+                                ptr addrspace(1) %l)
     !kernel_arg_type !1 !kernel_arg_base_type !1 {
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll
index 45b3a77417e13..908229176417b 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll
@@ -70,18 +70,18 @@
 ; CHECK:          TypeName:  image3d_t
 ; CHECK:          Size:      8
 ; CHECK:          ValueKind: Image
-define amdgpu_kernel void @test(%opencl.image1d_t addrspace(1)* %a,
-                                %opencl.image1d_array_t addrspace(1)* %b,
-                                %opencl.image1d_buffer_t addrspace(1)* %c,
-                                %opencl.image2d_t addrspace(1)* %d,
-                                %opencl.image2d_array_t addrspace(1)* %e,
-                                %opencl.image2d_array_depth_t addrspace(1)* %f,
-                                %opencl.image2d_array_msaa_t addrspace(1)* %g,
-                                %opencl.image2d_array_msaa_depth_t addrspace(1)* %h,
-                                %opencl.image2d_depth_t addrspace(1)* %i,
-                                %opencl.image2d_msaa_t addrspace(1)* %j,
-                                %opencl.image2d_msaa_depth_t addrspace(1)* %k,
-                                %opencl.image3d_t addrspace(1)* %l)
+define amdgpu_kernel void @test(ptr addrspace(1) %a,
+                                ptr addrspace(1) %b,
+                                ptr addrspace(1) %c,
+                                ptr addrspace(1) %d,
+                                ptr addrspace(1) %e,
+                                ptr addrspace(1) %f,
+                                ptr addrspace(1) %g,
+                                ptr addrspace(1) %h,
+                                ptr addrspace(1) %i,
+                                ptr addrspace(1) %j,
+                                ptr addrspace(1) %k,
+                                ptr addrspace(1) %l)
     !kernel_arg_type !1 !kernel_arg_base_type !1 {
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
index 0c5555547eec4..6fdf68b8b4925 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -21,28 +21,28 @@
 ; WAVE64:    .wavefront_size: 64
 ; WAVE32:    .wavefront_size: 32
 define amdgpu_kernel void @test(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
 ; CHECK:   - .args:
 ; CHECK:     .max_flat_workgroup_size: 256
 define amdgpu_kernel void @test_max_flat_workgroup_size(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #2 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #2 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -53,35 +53,35 @@ entry:
 ; GFX1010:  .sgpr_spill_count: 48
 ; CHECK:   .symbol:     num_spilled_sgprs.kd
 define amdgpu_kernel void @num_spilled_sgprs(
-    i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
-    i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32],
-    i32 addrspace(1)* %out4, i32 addrspace(1)* %out5, [8 x i32],
-    i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, [8 x i32],
-    i32 addrspace(1)* %out8, i32 addrspace(1)* %out9, [8 x i32],
-    i32 addrspace(1)* %outa, i32 addrspace(1)* %outb, [8 x i32],
-    i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, [8 x i32],
-    i32 addrspace(1)* %oute, i32 addrspace(1)* %outf, [8 x i32],
+    ptr addrspace(1) %out0, ptr addrspace(1) %out1, [8 x i32],
+    ptr addrspace(1) %out2, ptr addrspace(1) %out3, [8 x i32],
+    ptr addrspace(1) %out4, ptr addrspace(1) %out5, [8 x i32],
+    ptr addrspace(1) %out6, ptr addrspace(1) %out7, [8 x i32],
+    ptr addrspace(1) %out8, ptr addrspace(1) %out9, [8 x i32],
+    ptr addrspace(1) %outa, ptr addrspace(1) %outb, [8 x i32],
+    ptr addrspace(1) %outc, ptr addrspace(1) %outd, [8 x i32],
+    ptr addrspace(1) %oute, ptr addrspace(1) %outf, [8 x i32],
     i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32],
     i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32],
     i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32],
     i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 {
 entry:
-  store i32 %in0, i32 addrspace(1)* %out0
-  store i32 %in1, i32 addrspace(1)* %out1
-  store i32 %in2, i32 addrspace(1)* %out2
-  store i32 %in3, i32 addrspace(1)* %out3
-  store i32 %in4, i32 addrspace(1)* %out4
-  store i32 %in5, i32 addrspace(1)* %out5
-  store i32 %in6, i32 addrspace(1)* %out6
-  store i32 %in7, i32 addrspace(1)* %out7
-  store i32 %in8, i32 addrspace(1)* %out8
-  store i32 %in9, i32 addrspace(1)* %out9
-  store i32 %ina, i32 addrspace(1)* %outa
-  store i32 %inb, i32 addrspace(1)* %outb
-  store i32 %inc, i32 addrspace(1)* %outc
-  store i32 %ind, i32 addrspace(1)* %outd
-  store i32 %ine, i32 addrspace(1)* %oute
-  store i32 %inf, i32 addrspace(1)* %outf
+  store i32 %in0, ptr addrspace(1) %out0
+  store i32 %in1, ptr addrspace(1) %out1
+  store i32 %in2, ptr addrspace(1) %out2
+  store i32 %in3, ptr addrspace(1) %out3
+  store i32 %in4, ptr addrspace(1) %out4
+  store i32 %in5, ptr addrspace(1) %out5
+  store i32 %in6, ptr addrspace(1) %out6
+  store i32 %in7, ptr addrspace(1) %out7
+  store i32 %in8, ptr addrspace(1) %out8
+  store i32 %in9, ptr addrspace(1) %out9
+  store i32 %ina, ptr addrspace(1) %outa
+  store i32 %inb, ptr addrspace(1) %outb
+  store i32 %inc, ptr addrspace(1) %outc
+  store i32 %ind, ptr addrspace(1) %outd
+  store i32 %ine, ptr addrspace(1) %oute
+  store i32 %inf, ptr addrspace(1) %outf
   ret void
 }
 
@@ -89,69 +89,69 @@ entry:
 ; CHECK:   .symbol:     num_spilled_vgprs.kd
 ; CHECK:   .vgpr_spill_count: {{13|14}}
 define amdgpu_kernel void @num_spilled_vgprs() #1 {
-  %val0 = load volatile float, float addrspace(1)* @var
-  %val1 = load volatile float, float addrspace(1)* @var
-  %val2 = load volatile float, float addrspace(1)* @var
-  %val3 = load volatile float, float addrspace(1)* @var
-  %val4 = load volatile float, float addrspace(1)* @var
-  %val5 = load volatile float, float addrspace(1)* @var
-  %val6 = load volatile float, float addrspace(1)* @var
-  %val7 = load volatile float, float addrspace(1)* @var
-  %val8 = load volatile float, float addrspace(1)* @var
-  %val9 = load volatile float, float addrspace(1)* @var
-  %val10 = load volatile float, float addrspace(1)* @var
-  %val11 = load volatile float, float addrspace(1)* @var
-  %val12 = load volatile float, float addrspace(1)* @var
-  %val13 = load volatile float, float addrspace(1)* @var
-  %val14 = load volatile float, float addrspace(1)* @var
-  %val15 = load volatile float, float addrspace(1)* @var
-  %val16 = load volatile float, float addrspace(1)* @var
-  %val17 = load volatile float, float addrspace(1)* @var
-  %val18 = load volatile float, float addrspace(1)* @var
-  %val19 = load volatile float, float addrspace(1)* @var
-  %val20 = load volatile float, float addrspace(1)* @var
-  %val21 = load volatile float, float addrspace(1)* @var
-  %val22 = load volatile float, float addrspace(1)* @var
-  %val23 = load volatile float, float addrspace(1)* @var
-  %val24 = load volatile float, float addrspace(1)* @var
-  %val25 = load volatile float, float addrspace(1)* @var
-  %val26 = load volatile float, float addrspace(1)* @var
-  %val27 = load volatile float, float addrspace(1)* @var
-  %val28 = load volatile float, float addrspace(1)* @var
-  %val29 = load volatile float, float addrspace(1)* @var
-  %val30 = load volatile float, float addrspace(1)* @var
+  %val0 = load volatile float, ptr addrspace(1) @var
+  %val1 = load volatile float, ptr addrspace(1) @var
+  %val2 = load volatile float, ptr addrspace(1) @var
+  %val3 = load volatile float, ptr addrspace(1) @var
+  %val4 = load volatile float, ptr addrspace(1) @var
+  %val5 = load volatile float, ptr addrspace(1) @var
+  %val6 = load volatile float, ptr addrspace(1) @var
+  %val7 = load volatile float, ptr addrspace(1) @var
+  %val8 = load volatile float, ptr addrspace(1) @var
+  %val9 = load volatile float, ptr addrspace(1) @var
+  %val10 = load volatile float, ptr addrspace(1) @var
+  %val11 = load volatile float, ptr addrspace(1) @var
+  %val12 = load volatile float, ptr addrspace(1) @var
+  %val13 = load volatile float, ptr addrspace(1) @var
+  %val14 = load volatile float, ptr addrspace(1) @var
+  %val15 = load volatile float, ptr addrspace(1) @var
+  %val16 = load volatile float, ptr addrspace(1) @var
+  %val17 = load volatile float, ptr addrspace(1) @var
+  %val18 = load volatile float, ptr addrspace(1) @var
+  %val19 = load volatile float, ptr addrspace(1) @var
+  %val20 = load volatile float, ptr addrspace(1) @var
+  %val21 = load volatile float, ptr addrspace(1) @var
+  %val22 = load volatile float, ptr addrspace(1) @var
+  %val23 = load volatile float, ptr addrspace(1) @var
+  %val24 = load volatile float, ptr addrspace(1) @var
+  %val25 = load volatile float, ptr addrspace(1) @var
+  %val26 = load volatile float, ptr addrspace(1) @var
+  %val27 = load volatile float, ptr addrspace(1) @var
+  %val28 = load volatile float, ptr addrspace(1) @var
+  %val29 = load volatile float, ptr addrspace(1) @var
+  %val30 = load volatile float, ptr addrspace(1) @var
 
-  store volatile float %val0, float addrspace(1)* @var
-  store volatile float %val1, float addrspace(1)* @var
-  store volatile float %val2, float addrspace(1)* @var
-  store volatile float %val3, float addrspace(1)* @var
-  store volatile float %val4, float addrspace(1)* @var
-  store volatile float %val5, float addrspace(1)* @var
-  store volatile float %val6, float addrspace(1)* @var
-  store volatile float %val7, float addrspace(1)* @var
-  store volatile float %val8, float addrspace(1)* @var
-  store volatile float %val9, float addrspace(1)* @var
-  store volatile float %val10, float addrspace(1)* @var
-  store volatile float %val11, float addrspace(1)* @var
-  store volatile float %val12, float addrspace(1)* @var
-  store volatile float %val13, float addrspace(1)* @var
-  store volatile float %val14, float addrspace(1)* @var
-  store volatile float %val15, float addrspace(1)* @var
-  store volatile float %val16, float addrspace(1)* @var
-  store volatile float %val17, float addrspace(1)* @var
-  store volatile float %val18, float addrspace(1)* @var
-  store volatile float %val19, float addrspace(1)* @var
-  store volatile float %val20, float addrspace(1)* @var
-  store volatile float %val21, float addrspace(1)* @var
-  store volatile float %val22, float addrspace(1)* @var
-  store volatile float %val23, float addrspace(1)* @var
-  store volatile float %val24, float addrspace(1)* @var
-  store volatile float %val25, float addrspace(1)* @var
-  store volatile float %val26, float addrspace(1)* @var
-  store volatile float %val27, float addrspace(1)* @var
-  store volatile float %val28, float addrspace(1)* @var
-  store volatile float %val29, float addrspace(1)* @var
-  store volatile float %val30, float addrspace(1)* @var
+  store volatile float %val0, ptr addrspace(1) @var
+  store volatile float %val1, ptr addrspace(1) @var
+  store volatile float %val2, ptr addrspace(1) @var
+  store volatile float %val3, ptr addrspace(1) @var
+  store volatile float %val4, ptr addrspace(1) @var
+  store volatile float %val5, ptr addrspace(1) @var
+  store volatile float %val6, ptr addrspace(1) @var
+  store volatile float %val7, ptr addrspace(1) @var
+  store volatile float %val8, ptr addrspace(1) @var
+  store volatile float %val9, ptr addrspace(1) @var
+  store volatile float %val10, ptr addrspace(1) @var
+  store volatile float %val11, ptr addrspace(1) @var
+  store volatile float %val12, ptr addrspace(1) @var
+  store volatile float %val13, ptr addrspace(1) @var
+  store volatile float %val14, ptr addrspace(1) @var
+  store volatile float %val15, ptr addrspace(1) @var
+  store volatile float %val16, ptr addrspace(1) @var
+  store volatile float %val17, ptr addrspace(1) @var
+  store volatile float %val18, ptr addrspace(1) @var
+  store volatile float %val19, ptr addrspace(1) @var
+  store volatile float %val20, ptr addrspace(1) @var
+  store volatile float %val21, ptr addrspace(1) @var
+  store volatile float %val22, ptr addrspace(1) @var
+  store volatile float %val23, ptr addrspace(1) @var
+  store volatile float %val24, ptr addrspace(1) @var
+  store volatile float %val25, ptr addrspace(1) @var
+  store volatile float %val26, ptr addrspace(1) @var
+  store volatile float %val27, ptr addrspace(1) @var
+  store volatile float %val28, ptr addrspace(1) @var
+  store volatile float %val29, ptr addrspace(1) @var
+  store volatile float %val30, ptr addrspace(1) @var
 
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index e10f96072e254..fbb05b1a1527b 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -20,14 +20,14 @@
 ; CHECK:     NumVGPRs:                {{3|6}}
 ; CHECK:     MaxFlatWorkGroupSize:    1024
 define amdgpu_kernel void @test(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -43,14 +43,14 @@ entry:
 ; CHECK:     NumVGPRs:                {{3|6}}
 ; CHECK:     MaxFlatWorkGroupSize:    256
 define amdgpu_kernel void @test_max_flat_workgroup_size(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) #2 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) #2 {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %r.val = fadd half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -61,35 +61,35 @@ entry:
 ; GFX803:     NumSpilledSGPRs: 22
 ; GFX900:     NumSpilledSGPRs: {{22|48}}
 define amdgpu_kernel void @num_spilled_sgprs(
-    i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
-    i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32],
-    i32 addrspace(1)* %out4, i32 addrspace(1)* %out5, [8 x i32],
-    i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, [8 x i32],
-    i32 addrspace(1)* %out8, i32 addrspace(1)* %out9, [8 x i32],
-    i32 addrspace(1)* %outa, i32 addrspace(1)* %outb, [8 x i32],
-    i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, [8 x i32],
-    i32 addrspace(1)* %oute, i32 addrspace(1)* %outf, [8 x i32],
+    ptr addrspace(1) %out0, ptr addrspace(1) %out1, [8 x i32],
+    ptr addrspace(1) %out2, ptr addrspace(1) %out3, [8 x i32],
+    ptr addrspace(1) %out4, ptr addrspace(1) %out5, [8 x i32],
+    ptr addrspace(1) %out6, ptr addrspace(1) %out7, [8 x i32],
+    ptr addrspace(1) %out8, ptr addrspace(1) %out9, [8 x i32],
+    ptr addrspace(1) %outa, ptr addrspace(1) %outb, [8 x i32],
+    ptr addrspace(1) %outc, ptr addrspace(1) %outd, [8 x i32],
+    ptr addrspace(1) %oute, ptr addrspace(1) %outf, [8 x i32],
     i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32],
     i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32],
     i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32],
     i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 {
 entry:
-  store i32 %in0, i32 addrspace(1)* %out0
-  store i32 %in1, i32 addrspace(1)* %out1
-  store i32 %in2, i32 addrspace(1)* %out2
-  store i32 %in3, i32 addrspace(1)* %out3
-  store i32 %in4, i32 addrspace(1)* %out4
-  store i32 %in5, i32 addrspace(1)* %out5
-  store i32 %in6, i32 addrspace(1)* %out6
-  store i32 %in7, i32 addrspace(1)* %out7
-  store i32 %in8, i32 addrspace(1)* %out8
-  store i32 %in9, i32 addrspace(1)* %out9
-  store i32 %ina, i32 addrspace(1)* %outa
-  store i32 %inb, i32 addrspace(1)* %outb
-  store i32 %inc, i32 addrspace(1)* %outc
-  store i32 %ind, i32 addrspace(1)* %outd
-  store i32 %ine, i32 addrspace(1)* %oute
-  store i32 %inf, i32 addrspace(1)* %outf
+  store i32 %in0, ptr addrspace(1) %out0
+  store i32 %in1, ptr addrspace(1) %out1
+  store i32 %in2, ptr addrspace(1) %out2
+  store i32 %in3, ptr addrspace(1) %out3
+  store i32 %in4, ptr addrspace(1) %out4
+  store i32 %in5, ptr addrspace(1) %out5
+  store i32 %in6, ptr addrspace(1) %out6
+  store i32 %in7, ptr addrspace(1) %out7
+  store i32 %in8, ptr addrspace(1) %out8
+  store i32 %in9, ptr addrspace(1) %out9
+  store i32 %ina, ptr addrspace(1) %outa
+  store i32 %inb, ptr addrspace(1) %outb
+  store i32 %inc, ptr addrspace(1) %outc
+  store i32 %ind, ptr addrspace(1) %outd
+  store i32 %ine, ptr addrspace(1) %oute
+  store i32 %inf, ptr addrspace(1) %outf
   ret void
 }
 
@@ -98,69 +98,69 @@ entry:
 ; CHECK:   CodeProps:
 ; CHECK:     NumSpilledVGPRs: {{13|14}}
 define amdgpu_kernel void @num_spilled_vgprs() #1 {
-  %val0 = load volatile float, float addrspace(1)* @var
-  %val1 = load volatile float, float addrspace(1)* @var
-  %val2 = load volatile float, float addrspace(1)* @var
-  %val3 = load volatile float, float addrspace(1)* @var
-  %val4 = load volatile float, float addrspace(1)* @var
-  %val5 = load volatile float, float addrspace(1)* @var
-  %val6 = load volatile float, float addrspace(1)* @var
-  %val7 = load volatile float, float addrspace(1)* @var
-  %val8 = load volatile float, float addrspace(1)* @var
-  %val9 = load volatile float, float addrspace(1)* @var
-  %val10 = load volatile float, float addrspace(1)* @var
-  %val11 = load volatile float, float addrspace(1)* @var
-  %val12 = load volatile float, float addrspace(1)* @var
-  %val13 = load volatile float, float addrspace(1)* @var
-  %val14 = load volatile float, float addrspace(1)* @var
-  %val15 = load volatile float, float addrspace(1)* @var
-  %val16 = load volatile float, float addrspace(1)* @var
-  %val17 = load volatile float, float addrspace(1)* @var
-  %val18 = load volatile float, float addrspace(1)* @var
-  %val19 = load volatile float, float addrspace(1)* @var
-  %val20 = load volatile float, float addrspace(1)* @var
-  %val21 = load volatile float, float addrspace(1)* @var
-  %val22 = load volatile float, float addrspace(1)* @var
-  %val23 = load volatile float, float addrspace(1)* @var
-  %val24 = load volatile float, float addrspace(1)* @var
-  %val25 = load volatile float, float addrspace(1)* @var
-  %val26 = load volatile float, float addrspace(1)* @var
-  %val27 = load volatile float, float addrspace(1)* @var
-  %val28 = load volatile float, float addrspace(1)* @var
-  %val29 = load volatile float, float addrspace(1)* @var
-  %val30 = load volatile float, float addrspace(1)* @var
+  %val0 = load volatile float, ptr addrspace(1) @var
+  %val1 = load volatile float, ptr addrspace(1) @var
+  %val2 = load volatile float, ptr addrspace(1) @var
+  %val3 = load volatile float, ptr addrspace(1) @var
+  %val4 = load volatile float, ptr addrspace(1) @var
+  %val5 = load volatile float, ptr addrspace(1) @var
+  %val6 = load volatile float, ptr addrspace(1) @var
+  %val7 = load volatile float, ptr addrspace(1) @var
+  %val8 = load volatile float, ptr addrspace(1) @var
+  %val9 = load volatile float, ptr addrspace(1) @var
+  %val10 = load volatile float, ptr addrspace(1) @var
+  %val11 = load volatile float, ptr addrspace(1) @var
+  %val12 = load volatile float, ptr addrspace(1) @var
+  %val13 = load volatile float, ptr addrspace(1) @var
+  %val14 = load volatile float, ptr addrspace(1) @var
+  %val15 = load volatile float, ptr addrspace(1) @var
+  %val16 = load volatile float, ptr addrspace(1) @var
+  %val17 = load volatile float, ptr addrspace(1) @var
+  %val18 = load volatile float, ptr addrspace(1) @var
+  %val19 = load volatile float, ptr addrspace(1) @var
+  %val20 = load volatile float, ptr addrspace(1) @var
+  %val21 = load volatile float, ptr addrspace(1) @var
+  %val22 = load volatile float, ptr addrspace(1) @var
+  %val23 = load volatile float, ptr addrspace(1) @var
+  %val24 = load volatile float, ptr addrspace(1) @var
+  %val25 = load volatile float, ptr addrspace(1) @var
+  %val26 = load volatile float, ptr addrspace(1) @var
+  %val27 = load volatile float, ptr addrspace(1) @var
+  %val28 = load volatile float, ptr addrspace(1) @var
+  %val29 = load volatile float, ptr addrspace(1) @var
+  %val30 = load volatile float, ptr addrspace(1) @var
 
-  store volatile float %val0, float addrspace(1)* @var
-  store volatile float %val1, float addrspace(1)* @var
-  store volatile float %val2, float addrspace(1)* @var
-  store volatile float %val3, float addrspace(1)* @var
-  store volatile float %val4, float addrspace(1)* @var
-  store volatile float %val5, float addrspace(1)* @var
-  store volatile float %val6, float addrspace(1)* @var
-  store volatile float %val7, float addrspace(1)* @var
-  store volatile float %val8, float addrspace(1)* @var
-  store volatile float %val9, float addrspace(1)* @var
-  store volatile float %val10, float addrspace(1)* @var
-  store volatile float %val11, float addrspace(1)* @var
-  store volatile float %val12, float addrspace(1)* @var
-  store volatile float %val13, float addrspace(1)* @var
-  store volatile float %val14, float addrspace(1)* @var
-  store volatile float %val15, float addrspace(1)* @var
-  store volatile float %val16, float addrspace(1)* @var
-  store volatile float %val17, float addrspace(1)* @var
-  store volatile float %val18, float addrspace(1)* @var
-  store volatile float %val19, float addrspace(1)* @var
-  store volatile float %val20, float addrspace(1)* @var
-  store volatile float %val21, float addrspace(1)* @var
-  store volatile float %val22, float addrspace(1)* @var
-  store volatile float %val23, float addrspace(1)* @var
-  store volatile float %val24, float addrspace(1)* @var
-  store volatile float %val25, float addrspace(1)* @var
-  store volatile float %val26, float addrspace(1)* @var
-  store volatile float %val27, float addrspace(1)* @var
-  store volatile float %val28, float addrspace(1)* @var
-  store volatile float %val29, float addrspace(1)* @var
-  store volatile float %val30, float addrspace(1)* @var
+  store volatile float %val0, ptr addrspace(1) @var
+  store volatile float %val1, ptr addrspace(1) @var
+  store volatile float %val2, ptr addrspace(1) @var
+  store volatile float %val3, ptr addrspace(1) @var
+  store volatile float %val4, ptr addrspace(1) @var
+  store volatile float %val5, ptr addrspace(1) @var
+  store volatile float %val6, ptr addrspace(1) @var
+  store volatile float %val7, ptr addrspace(1) @var
+  store volatile float %val8, ptr addrspace(1) @var
+  store volatile float %val9, ptr addrspace(1) @var
+  store volatile float %val10, ptr addrspace(1) @var
+  store volatile float %val11, ptr addrspace(1) @var
+  store volatile float %val12, ptr addrspace(1) @var
+  store volatile float %val13, ptr addrspace(1) @var
+  store volatile float %val14, ptr addrspace(1) @var
+  store volatile float %val15, ptr addrspace(1) @var
+  store volatile float %val16, ptr addrspace(1) @var
+  store volatile float %val17, ptr addrspace(1) @var
+  store volatile float %val18, ptr addrspace(1) @var
+  store volatile float %val19, ptr addrspace(1) @var
+  store volatile float %val20, ptr addrspace(1) @var
+  store volatile float %val21, ptr addrspace(1) @var
+  store volatile float %val22, ptr addrspace(1) @var
+  store volatile float %val23, ptr addrspace(1) @var
+  store volatile float %val24, ptr addrspace(1) @var
+  store volatile float %val25, ptr addrspace(1) @var
+  store volatile float %val26, ptr addrspace(1) @var
+  store volatile float %val27, ptr addrspace(1) @var
+  store volatile float %val28, ptr addrspace(1) @var
+  store volatile float %val29, ptr addrspace(1) @var
+  store volatile float %val30, ptr addrspace(1) @var
 
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll
index 58479db645b1d..4eb472b9ccd86 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-multigrid-sync-arg-v5.ll
@@ -6,35 +6,34 @@ declare void @function1()
 declare void @function2() #0
 
 ; Function Attrs: noinline
-define void @function3(i8 addrspace(4)* %argptr, i8 addrspace(4)* addrspace(1)* %sink) #2 {
-  store i8 addrspace(4)* %argptr, i8 addrspace(4)* addrspace(1)* %sink, align 8
+define void @function3(ptr addrspace(4) %argptr, ptr addrspace(1) %sink) #2 {
+  store ptr addrspace(4) %argptr, ptr addrspace(1) %sink, align 8
   ret void
 }
 
 ; Function Attrs: noinline
-define void @function4(i64 %arg, i64* %a) #2 {
-  store i64 %arg, i64* %a
+define void @function4(i64 %arg, ptr %a) #2 {
+  store i64 %arg, ptr %a
   ret void
 }
 
 ; Function Attrs: noinline
-define void @function5(i8 addrspace(4)* %ptr, i64* %sink) #2 {
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 72
-  %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
-  %x = load i64, i64 addrspace(4)* %cast
-  store i64 %x, i64* %sink
+define void @function5(ptr addrspace(4) %ptr, ptr %sink) #2 {
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 72
+  %x = load i64, ptr addrspace(4) %gep
+  store i64 %x, ptr %sink
   ret void
 }
 
 ; Function Attrs: nounwind readnone speculatable willreturn
-declare align 4 i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1
+declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1
 
 ; CHECK: amdhsa.kernels:
 ; CHECK:  - .args:
 ; CHECK-NOT: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel10
-define amdgpu_kernel void @test_kernel10(i8* %a) {
-  store i8 3, i8* %a, align 1
+define amdgpu_kernel void @test_kernel10(ptr %a) {
+  store i8 3, ptr %a, align 1
   ret void
 }
 
@@ -43,9 +42,9 @@ define amdgpu_kernel void @test_kernel10(i8* %a) {
 ; CHECK:  - .args:
 ; CHECK: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel20
-define amdgpu_kernel void @test_kernel20(i8* %a) {
+define amdgpu_kernel void @test_kernel20(ptr %a) {
   call void @function1()
-  store i8 3, i8* %a, align 1
+  store i8 3, ptr %a, align 1
   ret void
 }
 
@@ -54,9 +53,9 @@ define amdgpu_kernel void @test_kernel20(i8* %a) {
 ; CHECK:  - .args:
 ; CHECK-NOT: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel21
-define amdgpu_kernel void @test_kernel21(i8* %a) #0 {
+define amdgpu_kernel void @test_kernel21(ptr %a) #0 {
   call void @function1()
-  store i8 3, i8* %a, align 1
+  store i8 3, ptr %a, align 1
   ret void
 }
 
@@ -65,9 +64,9 @@ define amdgpu_kernel void @test_kernel21(i8* %a) #0 {
 ; CHECK:  - .args:
 ; CHECK-NOT: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel22
-define amdgpu_kernel void @test_kernel22(i8* %a) {
+define amdgpu_kernel void @test_kernel22(ptr %a) {
   call void @function2()
-  store i8 3, i8* %a, align 1
+  store i8 3, ptr %a, align 1
   ret void
 }
 
@@ -76,12 +75,11 @@ define amdgpu_kernel void @test_kernel22(i8* %a) {
 ; CHECK:  - .args:
 ; CHECK: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel30
-define amdgpu_kernel void @test_kernel30(i128* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 80
-  %cast = bitcast i8 addrspace(4)* %gep to i128 addrspace(4)*
-  %x = load i128, i128 addrspace(4)* %cast
-  store i128 %x, i128* %a
+define amdgpu_kernel void @test_kernel30(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 80
+  %x = load i128, ptr addrspace(4) %gep
+  store i128 %x, ptr %a
   ret void
 }
 
@@ -90,12 +88,11 @@ define amdgpu_kernel void @test_kernel30(i128* %a) {
 ; CHECK:  - .args:
 ; CHECK: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel40
-define amdgpu_kernel void @test_kernel40(i64* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 88
-  %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
-  %x = load i64, i64 addrspace(4)* %cast
-  store i64 %x, i64* %a
+define amdgpu_kernel void @test_kernel40(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 88
+  %x = load i64, ptr addrspace(4) %gep
+  store i64 %x, ptr %a
   ret void
 }
 
@@ -104,12 +101,11 @@ define amdgpu_kernel void @test_kernel40(i64* %a) {
 ; CHECK:  - .args:
 ; CHECK-NOT: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel41
-define amdgpu_kernel void @test_kernel41(i64* %a) #0 {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 88
-  %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
-  %x = load i64, i64 addrspace(4)* %cast
-  store i64 %x, i64* %a
+define amdgpu_kernel void @test_kernel41(ptr %a) #0 {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 88
+  %x = load i64, ptr addrspace(4) %gep
+  store i64 %x, ptr %a
   ret void
 }
 
@@ -118,12 +114,11 @@ define amdgpu_kernel void @test_kernel41(i64* %a) #0 {
 ; CHECK:  - .args:
 ; CHECK-NOT: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel42
-define amdgpu_kernel void @test_kernel42(i64* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 80
-  %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
-  %x = load i64, i64 addrspace(4)* %cast
-  store i64 %x, i64* %a
+define amdgpu_kernel void @test_kernel42(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 80
+  %x = load i64, ptr addrspace(4) %gep
+  store i64 %x, ptr %a
   ret void
 }
 
@@ -132,12 +127,11 @@ define amdgpu_kernel void @test_kernel42(i64* %a) {
 ; CHECK:  - .args:
 ; CHECK-NOT: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel43
-define amdgpu_kernel void @test_kernel43(i64* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 96
-  %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
-  %x = load i64, i64 addrspace(4)* %cast
-  store i64 %x, i64* %a
+define amdgpu_kernel void @test_kernel43(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 96
+  %x = load i64, ptr addrspace(4) %gep
+  store i64 %x, ptr %a
   ret void
 }
 
@@ -146,11 +140,11 @@ define amdgpu_kernel void @test_kernel43(i64* %a) {
 ; CHECK:  - .args:
 ; CHECK-NOT: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel44
-define amdgpu_kernel void @test_kernel44(i8* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 87
-  %x = load i8, i8 addrspace(4)* %gep, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel44(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 87
+  %x = load i8, ptr addrspace(4) %gep, align 1
+  store i8 %x, ptr %a, align 1
   ret void
 }
 
@@ -159,11 +153,11 @@ define amdgpu_kernel void @test_kernel44(i8* %a) {
 ; CHECK:  - .args:
 ; CHECK: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel45
-define amdgpu_kernel void @test_kernel45(i8* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 88
-  %x = load i8, i8 addrspace(4)* %gep, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel45(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 88
+  %x = load i8, ptr addrspace(4) %gep, align 1
+  store i8 %x, ptr %a, align 1
   ret void
 }
 
@@ -172,11 +166,11 @@ define amdgpu_kernel void @test_kernel45(i8* %a) {
 ; CHECK:  - .args:
 ; CHECK: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel46
-define amdgpu_kernel void @test_kernel46(i8* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 95
-  %x = load i8, i8 addrspace(4)* %gep, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel46(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 95
+  %x = load i8, ptr addrspace(4) %gep, align 1
+  store i8 %x, ptr %a, align 1
   ret void
 }
 
@@ -185,11 +179,11 @@ define amdgpu_kernel void @test_kernel46(i8* %a) {
 ; CHECK:  - .args:
 ; CHECK-NOT: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel47
-define amdgpu_kernel void @test_kernel47(i8* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 96
-  %x = load i8, i8 addrspace(4)* %gep, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel47(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 96
+  %x = load i8, ptr addrspace(4) %gep, align 1
+  store i8 %x, ptr %a, align 1
   ret void
 }
 
@@ -198,11 +192,11 @@ define amdgpu_kernel void @test_kernel47(i8* %a) {
 ; CHECK:  - .args:
 ; CHECK: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel50
-define amdgpu_kernel void @test_kernel50(i8* %a, i32 %b) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 %b
-  %x = load i8, i8 addrspace(4)* %gep, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel50(ptr %a, i32 %b) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i32 %b
+  %x = load i8, ptr addrspace(4) %gep, align 1
+  store i8 %x, ptr %a, align 1
   ret void
 }
 
@@ -211,12 +205,12 @@ define amdgpu_kernel void @test_kernel50(i8* %a, i32 %b) {
 ; CHECK:  - .args:
 ; CHECK: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel51
-define amdgpu_kernel void @test_kernel51(i8* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep1 = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 16
-  %gep2 = getelementptr inbounds i8, i8 addrspace(4)* %gep1, i64 72
-  %x = load i8, i8 addrspace(4)* %gep2, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel51(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep1 = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 16
+  %gep2 = getelementptr inbounds i8, ptr addrspace(4) %gep1, i64 72
+  %x = load i8, ptr addrspace(4) %gep2, align 1
+  store i8 %x, ptr %a, align 1
   ret void
 }
 
@@ -225,12 +219,12 @@ define amdgpu_kernel void @test_kernel51(i8* %a) {
 ; CHECK:  - .args:
 ; CHECK-NOT: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel52
-define amdgpu_kernel void @test_kernel52(i8* %a) {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep1 = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 16
-  %gep2 = getelementptr inbounds i8, i8 addrspace(4)* %gep1, i64 16
-  %x = load i8, i8 addrspace(4)* %gep2, align 1
-  store i8 %x, i8* %a, align 1
+define amdgpu_kernel void @test_kernel52(ptr %a) {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep1 = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 16
+  %gep2 = getelementptr inbounds i8, ptr addrspace(4) %gep1, i64 16
+  %x = load i8, ptr addrspace(4) %gep2, align 1
+  store i8 %x, ptr %a, align 1
   ret void
 }
 
@@ -239,12 +233,11 @@ define amdgpu_kernel void @test_kernel52(i8* %a) {
 ; CHECK:  - .args:
 ; CHECK: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel60
-define amdgpu_kernel void @test_kernel60(i64* %a) #2 {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 88
-  %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)*
-  %x = load i64, i64 addrspace(4)* %cast
-  call void @function4(i64 %x, i64* %a)
+define amdgpu_kernel void @test_kernel60(ptr %a) #2 {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 88
+  %x = load i64, ptr addrspace(4) %gep
+  call void @function4(i64 %x, ptr %a)
   ret void
 }
 
@@ -253,10 +246,10 @@ define amdgpu_kernel void @test_kernel60(i64* %a) #2 {
 ; CHECK:  - .args:
 ; CHECK: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel61
-define amdgpu_kernel void @test_kernel61(i64* %a) #2 {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 16
-  call void @function5(i8 addrspace(4)* %gep, i64* %a)
+define amdgpu_kernel void @test_kernel61(ptr %a) #2 {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i64 16
+  call void @function5(ptr addrspace(4) %gep, ptr %a)
   ret void
 }
 
@@ -265,10 +258,10 @@ define amdgpu_kernel void @test_kernel61(i64* %a) #2 {
 ; CHECK:  - .args:
 ; CHECK: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel70
-define amdgpu_kernel void @test_kernel70(i8 addrspace(4)* addrspace(1)* %sink) #2 {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 42
-  store i8 addrspace(4)* %gep, i8 addrspace(4)* addrspace(1)* %sink, align 8
+define amdgpu_kernel void @test_kernel70(ptr addrspace(1) %sink) #2 {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i32 42
+  store ptr addrspace(4) %gep, ptr addrspace(1) %sink, align 8
   ret void
 }
 
@@ -277,10 +270,10 @@ define amdgpu_kernel void @test_kernel70(i8 addrspace(4)* addrspace(1)* %sink) #
 ; CHECK:  - .args:
 ; CHECK: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel71
-define amdgpu_kernel void @test_kernel71(i8 addrspace(4)* addrspace(1)* %sink) #2 {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 42
-  call void @function3(i8 addrspace(4)* %gep, i8 addrspace(4)* addrspace(1)* %sink)
+define amdgpu_kernel void @test_kernel71(ptr addrspace(1) %sink) #2 {
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i32 42
+  call void @function3(ptr addrspace(4) %gep, ptr addrspace(1) %sink)
   ret void
 }
 
@@ -290,9 +283,9 @@ define amdgpu_kernel void @test_kernel71(i8 addrspace(4)* addrspace(1)* %sink) #
 ; CHECK-NOT: hidden_multigrid_sync_arg
 ; CHECK-LABEL:    .name:           test_kernel72
 define amdgpu_kernel void @test_kernel72() #2 {
-  %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 42
-  store i8 addrspace(4)* %gep, i8 addrspace(4)* addrspace(1)* undef, align 8
+  %ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %ptr, i32 42
+  store ptr addrspace(4) %gep, ptr addrspace(1) undef, align 8
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 45cb7b84ce5c3..80fd4c46de78f 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -71,9 +71,9 @@
 ; HSA: .Lfunc_end0:
 ; HSA: .size   simple, .Lfunc_end0-simple
 
-define amdgpu_kernel void @simple(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @simple(ptr addrspace(1) %out) {
 entry:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   ret void
 }
 
@@ -81,6 +81,6 @@ entry:
 ; HSA: enable_sgpr_kernarg_segment_ptr = 0
 define amdgpu_kernel void @simple_no_kernargs() {
 entry:
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index aa5450047278e..134ef4d5fc787 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -8,10 +8,10 @@
 ; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
 define amdgpu_kernel void @scratch_buffer_known_high_masklo14() #0 {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
-  %toint = ptrtoint i32 addrspace(5)* %alloca to i32
+  store volatile i32 0, ptr addrspace(5) %alloca
+  %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 16383
-  store volatile i32 %masked, i32 addrspace(1)* undef
+  store volatile i32 %masked, ptr addrspace(1) undef
   ret void
 }
 
@@ -21,10 +21,10 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo14() #0 {
 ; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
 define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
-  %toint = ptrtoint i32 addrspace(5)* %alloca to i32
+  store volatile i32 0, ptr addrspace(5) %alloca
+  %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 65535
-  store volatile i32 %masked, i32 addrspace(1)* undef
+  store volatile i32 %masked, ptr addrspace(1) undef
   ret void
 }
 
@@ -37,10 +37,10 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 {
 ; WAVE32: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
 define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
-  %toint = ptrtoint i32 addrspace(5)* %alloca to i32
+  store volatile i32 0, ptr addrspace(5) %alloca
+  %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 131071
-  store volatile i32 %masked, i32 addrspace(1)* undef
+  store volatile i32 %masked, ptr addrspace(1) undef
   ret void
 }
 
@@ -50,10 +50,10 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 {
 ; GCN: {{flat|global}}_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
 define amdgpu_kernel void @scratch_buffer_known_high_mask18() #0 {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
-  %toint = ptrtoint i32 addrspace(5)* %alloca to i32
+  store volatile i32 0, ptr addrspace(5) %alloca
+  %toint = ptrtoint ptr addrspace(5) %alloca to i32
   %masked = and i32 %toint, 262143
-  store volatile i32 %masked, i32 addrspace(1)* undef
+  store volatile i32 %masked, ptr addrspace(1) undef
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
index 37d05c7ac414c..7eeb724154b2a 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
@@ -5,12 +5,12 @@
 ; SI-LABEL: {{^}}br_implicit_def:
 ; SI: %bb.0:
 ; SI-NEXT: s_cbranch_scc1
-define amdgpu_kernel void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
+define amdgpu_kernel void @br_implicit_def(ptr addrspace(1) %out, i32 %arg) #0 {
 bb:
   br i1 undef, label %bb1, label %bb2
 
 bb1:
-  store volatile i32 123, i32 addrspace(1)* %out
+  store volatile i32 123, ptr addrspace(1) %out
   ret void
 
 bb2:

diff  --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 7286d9785ed9e..82887c73e9d5f 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -26,7 +26,7 @@ bb3:                                              ; preds = %bb2, %bb
   br i1 %tmp, label %bb4, label %bb6
 
 bb4:                                              ; preds = %bb3
-  %val = load volatile i32, i32 addrspace(1)* undef
+  %val = load volatile i32, ptr addrspace(1) undef
   %tmp5 = mul i32 %val, %arg
   br label %bb6
 
@@ -42,12 +42,12 @@ attributes #0 = { nounwind readnone }
 ; SI-LABEL: {{^}}vcopy_i1_undef
 ; SI: v_cndmask_b32_e64
 ; SI: v_cndmask_b32_e64
-define <2 x float> @vcopy_i1_undef(<2 x float> addrspace(1)* %p) {
+define <2 x float> @vcopy_i1_undef(ptr addrspace(1) %p) {
 entry:
   br i1 undef, label %exit, label %false
 
 false:
-  %x = load <2 x float>, <2 x float> addrspace(1)* %p
+  %x = load <2 x float>, ptr addrspace(1) %p
   %cmp = fcmp one <2 x float> %x, zeroinitializer
   br label %exit
 

diff  --git a/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll b/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
index 12cc440e48d9d..2a8b3c1b8c508 100644
--- a/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
@@ -6,13 +6,13 @@
 ;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;CHECK-NOT: SETNE_INT
 
-define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %0 = load i32, i32 addrspace(1)* %in
-  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %1 = load i32, i32 addrspace(1)* %arrayidx1
+  %0 = load i32, ptr addrspace(1) %in
+  %arrayidx1 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %1 = load i32, ptr addrspace(1) %arrayidx1
   %cmp = icmp eq i32 %0, %1
   %value = select i1 %cmp, i32 0, i32 -1
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
index 99c2138bbe64e..a46dcda4fb0a1 100644
--- a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
@@ -8,108 +8,108 @@
 ; GCN-LABEL: {{^}}i16_eq:
 ; VI: v_cmp_eq_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_eq(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %tmp0 = icmp eq i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_ne:
 ; VI: v_cmp_ne_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ne(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %tmp0 = icmp ne i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_ugt:
 ; VI: v_cmp_gt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_gt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ugt(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %tmp0 = icmp ugt i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_uge:
 ; VI: v_cmp_ge_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ge_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_uge(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %tmp0 = icmp uge i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_ult:
 ; VI: v_cmp_lt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ult(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %tmp0 = icmp ult i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_ule:
 ; VI: v_cmp_le_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_le_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ule(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %tmp0 = icmp ule i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 
 }
@@ -117,72 +117,72 @@ entry:
 ; GCN-LABEL: {{^}}i16_sgt:
 ; VI: v_cmp_gt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_gt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_sgt(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %tmp0 = icmp sgt i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_sge:
 ; VI: v_cmp_ge_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ge_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_sge(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %tmp0 = icmp sge i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_slt:
 ; VI: v_cmp_lt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_lt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_slt(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %tmp0 = icmp slt i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_sle:
 ; VI: v_cmp_le_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_le_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_sle(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %tmp0 = icmp sle i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -190,160 +190,160 @@ entry:
 ; GCN-LABEL: {{^}}i16_eq_v_s:
 ; VI: v_cmp_eq_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_eq_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_eq_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
   %tmp0 = icmp eq i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_ne_v_s:
 ; VI: v_cmp_ne_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ne_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ne_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
   %tmp0 = icmp ne i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_ugt_v_s:
 ; VI: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_lt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ugt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
   %tmp0 = icmp ugt i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_uge_v_s:
 ; VI: v_cmp_le_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_uge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
   %tmp0 = icmp uge i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_ult_v_s:
 ; VI: v_cmp_gt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ult_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
   %tmp0 = icmp ult i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_ule_v_s:
 ; VI: v_cmp_ge_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ge_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ule_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
   %tmp0 = icmp ule i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_sgt_v_s:
 ; VI: v_cmp_lt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_lt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_sgt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
   %tmp0 = icmp sgt i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_sge_v_s:
 ; VI: v_cmp_le_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_sge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
   %tmp0 = icmp sge i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_slt_v_s:
 ; VI: v_cmp_gt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_gt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_slt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
   %tmp0 = icmp slt i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}i16_sle_v_s:
 ; VI: v_cmp_ge_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ge_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_sle_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %a = load i16, i16 addrspace(1)* %a.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load i16, ptr addrspace(1) %a.gep
   %tmp0 = icmp sle i16 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out.gep
+  store i32 %tmp1, ptr addrspace(1) %out.gep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/icmp64.ll b/llvm/test/CodeGen/AMDGPU/icmp64.ll
index 189df011688c6..a26cf7fde32c0 100644
--- a/llvm/test/CodeGen/AMDGPU/icmp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp64.ll
@@ -4,92 +4,92 @@
 ; GCN-LABEL: {{^}}test_i64_eq:
 ; VI: s_cmp_eq_u64
 ; SI: v_cmp_eq_u64
-define amdgpu_kernel void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_eq(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp eq i64 %a, %b
   %result = sext i1 %cmp to i32
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_i64_ne:
 ; VI: s_cmp_lg_u64
 ; SI: v_cmp_ne_u64
-define amdgpu_kernel void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ne(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ne i64 %a, %b
   %result = sext i1 %cmp to i32
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_i64_slt:
 ; GCN: v_cmp_lt_i64
-define amdgpu_kernel void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_slt(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp slt i64 %a, %b
   %result = sext i1 %cmp to i32
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_i64_ult:
 ; GCN: v_cmp_lt_u64
-define amdgpu_kernel void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ult(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ult i64 %a, %b
   %result = sext i1 %cmp to i32
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_i64_sle:
 ; GCN: v_cmp_le_i64
-define amdgpu_kernel void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_sle(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sle i64 %a, %b
   %result = sext i1 %cmp to i32
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_i64_ule:
 ; GCN: v_cmp_le_u64
-define amdgpu_kernel void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ule(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ule i64 %a, %b
   %result = sext i1 %cmp to i32
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_i64_sgt:
 ; GCN: v_cmp_gt_i64
-define amdgpu_kernel void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_sgt(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sgt i64 %a, %b
   %result = sext i1 %cmp to i32
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_i64_ugt:
 ; GCN: v_cmp_gt_u64
-define amdgpu_kernel void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ugt(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ugt i64 %a, %b
   %result = sext i1 %cmp to i32
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_i64_sge:
 ; GCN: v_cmp_ge_i64
-define amdgpu_kernel void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_sge(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sge i64 %a, %b
   %result = sext i1 %cmp to i32
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_i64_uge:
 ; GCN: v_cmp_ge_u64
-define amdgpu_kernel void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_uge(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp uge i64 %a, %b
   %result = sext i1 %cmp to i32
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 2781d1e1b5213..28865c0b7c5d0 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
 
-define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
 ; GFX9-LABEL: udiv32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -155,14 +155,14 @@ bb3:                                              ; preds = %bb3, %bb
   %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
   %tmp4 = udiv i32 %tmp, %arg1
   %tmp5 = zext i32 %tmp to i64
-  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
-  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
+  store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
   %tmp7 = add nuw nsw i32 %tmp, 1
   %tmp8 = icmp eq i32 %tmp7, 1024
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
 ; GFX9-LABEL: urem32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -309,14 +309,14 @@ bb3:                                              ; preds = %bb3, %bb
   %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
   %tmp4 = urem i32 %tmp, %arg1
   %tmp5 = zext i32 %tmp to i64
-  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
-  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
+  store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
   %tmp7 = add nuw nsw i32 %tmp, 1
   %tmp8 = icmp eq i32 %tmp7, 1024
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
 ; GFX9-LABEL: sdiv32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x2c
@@ -471,14 +471,14 @@ bb3:                                              ; preds = %bb3, %bb
   %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
   %tmp4 = sdiv i32 %tmp, %arg1
   %tmp5 = zext i32 %tmp to i64
-  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
-  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
+  store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
   %tmp7 = add nuw nsw i32 %tmp, 1
   %tmp8 = icmp eq i32 %tmp7, 1024
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
 ; GFX9-LABEL: srem32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -620,14 +620,14 @@ bb3:                                              ; preds = %bb3, %bb
   %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
   %tmp4 = srem i32 %tmp, %arg1
   %tmp5 = zext i32 %tmp to i64
-  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
-  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp5
+  store i32 %tmp4, ptr addrspace(1) %tmp6, align 4
   %tmp7 = add nuw nsw i32 %tmp, 1
   %tmp8 = icmp eq i32 %tmp7, 1024
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
 ; GFX9-LABEL: udiv16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -746,14 +746,14 @@ bb3:                                              ; preds = %bb3, %bb
   %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
   %tmp4 = udiv i16 %tmp, %arg1
   %tmp5 = zext i16 %tmp to i64
-  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
-  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
+  %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
+  store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
   %tmp7 = add nuw nsw i16 %tmp, 1
   %tmp8 = icmp eq i16 %tmp7, 1024
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
 ; GFX9-LABEL: urem16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -872,14 +872,14 @@ bb3:                                              ; preds = %bb3, %bb
   %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
   %tmp4 = urem i16 %tmp, %arg1
   %tmp5 = zext i16 %tmp to i64
-  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
-  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
+  %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
+  store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
   %tmp7 = add nuw nsw i16 %tmp, 1
   %tmp8 = icmp eq i16 %tmp7, 1024
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
 ; GFX9-LABEL: sdiv16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -1014,14 +1014,14 @@ bb3:                                              ; preds = %bb3, %bb
   %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
   %tmp4 = sdiv i16 %tmp, %arg1
   %tmp5 = zext i16 %tmp to i64
-  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
-  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
+  %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
+  store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
   %tmp7 = add nuw nsw i16 %tmp, 1
   %tmp8 = icmp eq i16 %tmp7, 1024
   br i1 %tmp8, label %bb2, label %bb3
 }
 
-define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %arg, i16 %arg1) {
 ; GFX9-LABEL: srem16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -1165,8 +1165,8 @@ bb3:                                              ; preds = %bb3, %bb
   %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
   %tmp4 = srem i16 %tmp, %arg1
   %tmp5 = zext i16 %tmp to i64
-  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
-  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
+  %tmp6 = getelementptr inbounds i16, ptr addrspace(1) %arg, i64 %tmp5
+  store i16 %tmp4, ptr addrspace(1) %tmp6, align 2
   %tmp7 = add nuw nsw i16 %tmp, 1
   %tmp8 = icmp eq i16 %tmp7, 1024
   br i1 %tmp8, label %bb2, label %bb3

diff  --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index b68c1d158b939..ccde5efce08dc 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -9,7 +9,7 @@
 ; add(mul(S0.x, S1.y),
 ;     add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S1, S2, S3)
 
-define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -115,14 +115,14 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                 <2 x i16> addrspace(1)* %src2,
-                                 i32 addrspace(1)* nocapture %dst) {
+                                 ptr addrspace(1) %src2,
+                                 ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = zext i16 %s1.elt1 to i32
@@ -136,17 +136,17 @@ entry:
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul nuw i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul2, %s3
   %add6 = add i32 %add, %mul1
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Support this pattern
 ;      add(S3,
 ;          add (mul (S0.y, S1.y), mul (S0.y, S1.y))) -> v_dot2_{I|U}32_{I|U}16(S1, S2, S3)
-define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_MulMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -258,14 +258,14 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                        <2 x i16> addrspace(1)* %src2,
-                                        i32 addrspace(1)* nocapture %dst) {
+                                        ptr addrspace(1) %src2,
+                                        ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = zext i16 %s1.elt1 to i32
@@ -278,14 +278,14 @@ entry:
   %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul nuw i32 %conv4, %conv3
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul2, %mul1
   %add6 = add i32 %add, %s3
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot2:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -391,14 +391,14 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_dot2_i32_i16 v1, v2, v1, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                 <2 x i16> addrspace(1)* %src2,
-                                 i32 addrspace(1)* nocapture %dst) {
+                                 ptr addrspace(1) %src2,
+                                 ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = sext i16 %s1.elt1 to i32
@@ -412,14 +412,14 @@ entry:
   %conv4 = sext i16 %s2.elt2 to i32
   %mul2 = mul nuw i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul2, %s3
   %add6 = add i32 %add, %mul1
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot2_MixedTypedMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -531,14 +531,14 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                               <2 x i16> addrspace(1)* %src2,
-                                               i32 addrspace(1)* nocapture %dst) {
+                                               ptr addrspace(1) %src2,
+                                               ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = sext i16 %s1.elt1 to i32
@@ -552,14 +552,14 @@ entry:
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul nuw i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul2, %s3
   %add6 = add i32 %add, %mul1
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_alt_AddOperands:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -669,14 +669,14 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                 <2 x i16> addrspace(1)* %src2,
-                                                 i32 addrspace(1)* nocapture %dst) {
+                                                 ptr addrspace(1) %src2,
+                                                 ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = zext i16 %s1.elt1 to i32
@@ -690,14 +690,14 @@ entry:
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul nuw i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %s3, %mul2
   %add6 = add i32 %mul1, %add
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot2_MixedExt:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -809,14 +809,14 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                          <2 x i16> addrspace(1)* %src2,
-                                          i32 addrspace(1)* nocapture %dst) {
+                                          ptr addrspace(1) %src2,
+                                          ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = sext i16 %s1.elt1 to i32
@@ -830,14 +830,14 @@ entry:
   %conv4 = sext i16 %s2.elt2 to i32
   %mul2 = mul nuw i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul2, %s3
   %add6 = add i32 %add, %mul1
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
 ; GFX7-LABEL: notudot2_SameVec:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -948,14 +948,14 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                            <2 x i16> addrspace(1)* %src2,
-                                            i32 addrspace(1)* nocapture %dst) {
+                                            ptr addrspace(1) %src2,
+                                            ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = zext i16 %s1.elt1 to i32
@@ -969,14 +969,14 @@ entry:
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul2, %s3
   %add6 = add i32 %add, %mul1
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_v4i16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1083,14 +1083,14 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                       <4 x i16> addrspace(1)* %src2,
-                                       i32 addrspace(1)* nocapture %dst) {
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
   %conv = zext i16 %s1.elt1 to i32
@@ -1104,14 +1104,14 @@ entry:
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul2, %s3
   %add6 = add i32 %add, %mul1
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_v4i16_Hi:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1221,14 +1221,14 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                          <4 x i16> addrspace(1)* %src2,
-                                          i32 addrspace(1)* nocapture %dst) {
+                                          ptr addrspace(1) %src2,
+                                          ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <4 x i16> %vec1, i64 2
   %conv = zext i16 %s1.elt1 to i32
@@ -1242,14 +1242,14 @@ entry:
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul2, %s3
   %add6 = add i32 %add, %mul1
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
 ; GFX7-LABEL: notudot2_v4i16_Even:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1362,14 +1362,14 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                               <4 x i16> addrspace(1)* %src2,
-                                               i32 addrspace(1)* nocapture %dst) {
+                                               ptr addrspace(1) %src2,
+                                               ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
   %conv = zext i16 %s1.elt1 to i32
@@ -1383,14 +1383,14 @@ entry:
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul2, %s3
   %add6 = add i32 %add, %mul1
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
 ; GFX7-LABEL: notudot2_v4i16_Middle:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1503,14 +1503,14 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                 <4 x i16> addrspace(1)* %src2,
-                                                 i32 addrspace(1)* nocapture %dst) {
+                                                 ptr addrspace(1) %src2,
+                                                 ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <4 x i16> %vec1, i64 1
   %conv = zext i16 %s1.elt1 to i32
@@ -1524,14 +1524,14 @@ entry:
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul2, %s3
   %add6 = add i32 %add, %mul1
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
 ; GFX7-LABEL: notudot2_DiffIndex:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1643,14 +1643,14 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                              <2 x i16> addrspace(1)* %src2,
-                                              i32 addrspace(1)* nocapture %dst) {
+                                              ptr addrspace(1) %src2,
+                                              ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = zext i16 %s1.elt1 to i32
@@ -1664,14 +1664,14 @@ entry:
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul2, %s3
   %add6 = add i32 %add, %mul1
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_MultipleUses_add1:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1792,14 +1792,14 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v0
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                   <2 x i16> addrspace(1)* %src2,
-                                                   i32 addrspace(1)* nocapture %dst) {
+                                                   ptr addrspace(1) %src2,
+                                                   ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = zext i16 %s1.elt1 to i32
@@ -1813,16 +1813,16 @@ entry:
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add1 = add i32 %mul2, %s3
   %add2 = add i32 %add1, %mul1
 
   %res = add i32 %add2, %add1
-  store i32 %res, i32 addrspace(1)* %dst, align 4
+  store i32 %res, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot2_MultipleUses_add1:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1943,14 +1943,14 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v0
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                   <2 x i16> addrspace(1)* %src2,
-                                                   i32 addrspace(1)* nocapture %dst) {
+                                                   ptr addrspace(1) %src2,
+                                                   ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = sext i16 %s1.elt1 to i32
@@ -1964,16 +1964,16 @@ entry:
   %conv4 = sext i16 %s2.elt2 to i32
   %mul2 = mul i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add1 = add i32 %mul2, %s3
   %add2 = add i32 %add1, %mul1
 
   %res = add i32 %add2, %add1
-  store i32 %res, i32 addrspace(1)* %dst, align 4
+  store i32 %res, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_MultipleUses_mul1:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2099,14 +2099,14 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, v2
 ; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                   <2 x i16> addrspace(1)* %src2,
-                                                   i32 addrspace(1)* nocapture %dst) {
+                                                   ptr addrspace(1) %src2,
+                                                   ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = zext i16 %s1.elt1 to i32
@@ -2120,17 +2120,17 @@ entry:
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add0 = add i32 %mul1, %s3
 
   %add1 = add i32 %mul2, %add0
   %add2 = add i32 %add1, %mul1
 
-  store i32 %add2, i32 addrspace(1)* %dst, align 4
+  store i32 %add2, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot2_MultipleUses_mul1:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2256,14 +2256,14 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, v2
 ; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                   <2 x i16> addrspace(1)* %src2,
-                                                   i32 addrspace(1)* nocapture %dst) {
+                                                   ptr addrspace(1) %src2,
+                                                   ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = sext i16 %s1.elt1 to i32
@@ -2277,17 +2277,17 @@ entry:
   %conv4 = sext i16 %s2.elt2 to i32
   %mul2 = mul i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add0 = add i32 %mul1, %s3
 
   %add1 = add i32 %mul2, %add0
   %add2 = add i32 %add1, %mul1
 
-  store i32 %add2, i32 addrspace(1)* %dst, align 4
+  store i32 %add2, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_MultipleUses_mul2:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2411,14 +2411,14 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
 ; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                   <2 x i16> addrspace(1)* %src2,
-                                                   i32 addrspace(1)* nocapture %dst) {
+                                                   ptr addrspace(1) %src2,
+                                                   ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = zext i16 %s1.elt1 to i32
@@ -2432,17 +2432,17 @@ entry:
   %conv4 = zext i16 %s2.elt2 to i32
   %mul2 = mul i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add0 = add i32 %mul2, %s3
 
   %add1 = add i32 %mul2, %add0
   %add2 = add i32 %add1, %mul1
 
-  store i32 %add2, i32 addrspace(1)* %dst, align 4
+  store i32 %add2, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot2_MultipleUses_mul2:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2566,14 +2566,14 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
 ; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                   <2 x i16> addrspace(1)* %src2,
-                                                   i32 addrspace(1)* nocapture %dst) {
+                                                   ptr addrspace(1) %src2,
+                                                   ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
   %conv = sext i16 %s1.elt1 to i32
@@ -2587,17 +2587,17 @@ entry:
   %conv4 = sext i16 %s2.elt2 to i32
   %mul2 = mul i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add0 = add i32 %mul2, %s3
 
   %add1 = add i32 %mul2, %add0
   %add2 = add i32 %add1, %mul1
 
-  store i32 %add2, i32 addrspace(1)* %dst, align 4
+  store i32 %add2, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
+define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_acc16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2714,14 +2714,14 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                       <2 x i16> addrspace(1)* %src2,
-                                       i16 addrspace(1)* nocapture %dst) {
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
-  %v1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
-  %v2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
+  %v1 = load <2 x i16>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
+  %v2 = load <2 x i16>, ptr addrspace(1) %gep2
 
   %v1e1 = extractelement <2 x i16> %v1, i64 0
   %v2e1 = extractelement <2 x i16> %v2, i64 0
@@ -2731,14 +2731,14 @@ entry:
   %v2e2 = extractelement <2 x i16> %v2, i64 1
   %mul2 = mul i16 %v1e2, %v2e2
 
-  %s2 = load i16, i16 addrspace(1)* %dst, align 2
+  %s2 = load i16, ptr addrspace(1) %dst, align 2
   %add1 = add i16 %mul2, %s2
   %add2 = add i16 %add1, %mul1
-  store i16 %add2, i16 addrspace(1)* %dst, align 2
+  store i16 %add2, ptr addrspace(1) %dst, align 2
   ret void
 }
 
-define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
 ; GFX7-LABEL: notsdot2_sext8:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2859,14 +2859,14 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, s2, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                          <2 x i8> addrspace(1)* %src2,
-                                          i32 addrspace(1)* nocapture %dst) {
+                                          ptr addrspace(1) %src2,
+                                          ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <2 x i8>, <2 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <2 x i8>, <2 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <2 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <2 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <2 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <2 x i8>, ptr addrspace(1) %gep2
 
   %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
   %conv = sext i8 %s1.elt1 to i32
@@ -2880,10 +2880,10 @@ entry:
   %conv4 = sext i8 %s2.elt2 to i32
   %mul2 = mul nuw i32 %conv4, %conv3
 
-  %s3 = load i32, i32 addrspace(1)* %dst, align 4
+  %s3 = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul2, %s3
   %add6 = add i32 %add, %mul1
-  store i32 %add6, i32 addrspace(1)* %dst, align 4
+  store i32 %add6, ptr addrspace(1) %dst, align 4
   ret void
 }
 

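For reference, the transformation applied throughout these tests is mechanical: the pointee type is dropped from pointer-typed arguments and operands, and only the value type on loads, stores, and getelementptrs is kept. The short standalone kernel below is an illustrative sketch of the resulting opaque-pointer form; it is not taken from the commit, and the function and value names are hypothetical.

; Illustrative sketch (not part of this commit): a kernel written directly with
; opaque pointers. The pointee type now appears only on the memory operations;
; the pointer operands themselves are just 'ptr addrspace(1)'.
define amdgpu_kernel void @opaque_ptr_example(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  ; getelementptr and load still name the element type being addressed/loaded.
  %gep = getelementptr <2 x i16>, ptr addrspace(1) %src, i32 %idx
  %vec = load <2 x i16>, ptr addrspace(1) %gep
  %elt = extractelement <2 x i16> %vec, i64 0
  %ext = zext i16 %elt to i32
  store i32 %ext, ptr addrspace(1) %dst, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()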
diff  --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 867f0c3d2d0b0..6ecde579e8416 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 
-define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -127,14 +127,14 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                       <4 x i8> addrspace(1)* %src2,
-                                       i32 addrspace(1)* nocapture %dst) {
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %cv1e0 = sext i8 %v1e0 to i32
@@ -160,18 +160,18 @@ entry:
   %cv2e3 = sext i8 %v2e3 to i32
   %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %add1 = add i32 %mul1, %acc
   %add2 = add i32 %add1, %mul2
   %add3 = add i32 %add2, %mul3
   %add4 = add i32 %add3, %mul4
-  store i32 %add4, i32 addrspace(1)* %dst, align 4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Currently, vector elements{0 and 3} get zero_extended from i16 to i32 which should
 ; be sign_extended directly to i32; this prevents the pattern recognizer from recognizing this pattern.
-define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -358,14 +358,14 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                       <4 x i8> addrspace(1)* %src2,
-                                       i16 addrspace(1)* nocapture %dst) {
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %cv1e0 = sext i8 %v1e0 to i16
@@ -391,16 +391,16 @@ entry:
   %cv2e3 = sext i8 %v2e3 to i16
   %mul4 = mul nsw i16 %cv1e3, %cv2e3
 
-  %acc = load i16, i16 addrspace(1)* %dst, align 2
+  %acc = load i16, ptr addrspace(1) %dst, align 2
   %add1 = add i16 %mul1, %acc
   %add2 = add i16 %add1, %mul2
   %add3 = add i16 %add2, %mul3
   %add4 = add i16 %add3, %mul4
-  store i16 %add4, i16 addrspace(1)* %dst, align 2
+  store i16 %add4, ptr addrspace(1) %dst, align 2
   ret void
 }
 
-define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc8:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -547,14 +547,14 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                      <4 x i8> addrspace(1)* %src2,
-                                      i8 addrspace(1)* nocapture %dst) {
+                                      ptr addrspace(1) %src2,
+                                      ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %v2e0 = extractelement <4 x i8> %vec2, i64 0
@@ -572,16 +572,16 @@ entry:
   %v2e3 = extractelement <4 x i8> %vec2, i64 3
   %mul4 = mul i8 %v1e3, %v2e3
 
-  %acc = load i8, i8 addrspace(1)* %dst, align 2
+  %acc = load i8, ptr addrspace(1) %dst, align 2
   %add1 = add i8 %mul1, %acc
   %add2 = add i8 %add1, %mul2
   %add3 = add i8 %add2, %mul3
   %add4 = add nsw i8 %add3, %mul4
-  store i8 %add4, i8 addrspace(1)* %dst, align 2
+  store i8 %add4, ptr addrspace(1) %dst, align 2
   ret void
 }
 
-define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_multiuse_mul1:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -728,14 +728,14 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                               <4 x i8> addrspace(1)* %src2,
-                                               i32 addrspace(1)* nocapture %dst) {
+                                               ptr addrspace(1) %src2,
+                                               ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %cv1e0 = sext i8 %v1e0 to i32
@@ -761,19 +761,19 @@ entry:
   %cv2e3 = sext i8 %v2e3 to i32
   %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul1, %acc
   %add1 = add i32 %mul2, %add
   %add2 = add i32 %add1, %mul1
   %add3 = add i32 %add2, %mul3
   %add4 = add i32 %add3, %mul4
 
-  store i32 %add4, i32 addrspace(1)* %dst, align 4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Support this pattern.
-define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc32_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -917,14 +917,14 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                              <4 x i8> addrspace(1)* %src2,
-                                              i32 addrspace(1)* nocapture %dst) {
+                                              ptr addrspace(1) %src2,
+                                              ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %cvec1 = sext <4 x i8> %vec1 to <4 x i32>
   %cvec2 = sext <4 x i8> %vec2 to <4 x i32>
@@ -935,17 +935,17 @@ entry:
   %mul2 = extractelement <4 x i32> %mul, i64 2
   %mul3 = extractelement <4 x i32> %mul, i64 3
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %add1 = add i32 %mul0, %acc
   %add2 = add i32 %add1, %mul1
   %add3 = add i32 %add2, %mul2
   %add4 = add i32 %add3, %mul3
 
-  store i32 %add4, i32 addrspace(1)* %dst, align 4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc16_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1139,14 +1139,14 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                              <4 x i8> addrspace(1)* %src2,
-                                              i16 addrspace(1)* nocapture %dst) {
+                                              ptr addrspace(1) %src2,
+                                              ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %cvec1 = sext <4 x i8> %vec1 to <4 x i16>
   %cvec2 = sext <4 x i8> %vec2 to <4 x i16>
@@ -1157,13 +1157,13 @@ entry:
   %mul2 = extractelement <4 x i16> %mul, i64 2
   %mul3 = extractelement <4 x i16> %mul, i64 3
 
-  %acc = load i16, i16 addrspace(1)* %dst, align 4
+  %acc = load i16, ptr addrspace(1) %dst, align 4
   %add1 = add i16 %mul0, %acc
   %add2 = add i16 %add1, %mul1
   %add3 = add i16 %add2, %mul2
   %add4 = add i16 %add3, %mul3
 
-  store i16 %add4, i16 addrspace(1)* %dst, align 4
+  store i16 %add4, ptr addrspace(1) %dst, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index a4177a8c263a5..5aad8c6880b9d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 
-define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -127,14 +127,14 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                       <4 x i8> addrspace(1)* %src2,
-                                       i32 addrspace(1)* nocapture %dst) {
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %cv1e0 = zext i8 %v1e0 to i32
@@ -160,17 +160,17 @@ entry:
   %cv2e3 = zext i8 %v2e3 to i32
   %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %mad1 = add i32 %mul1, %acc
   %mad2 = add i32 %mad1, %mul2
   %mad3 = add i32 %mad2, %mul3
   %mad4 = add i32 %mad3, %mul4
 
-  store i32 %mad4, i32 addrspace(1)* %dst, align 4
+  store i32 %mad4, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -337,14 +337,14 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                       <4 x i8> addrspace(1)* %src2,
-                                       i16 addrspace(1)* nocapture %dst) {
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %cv1e0 = zext i8 %v1e0 to i16
@@ -370,17 +370,17 @@ entry:
   %cv2e3 = zext i8 %v2e3 to i16
   %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
 
-  %acc = load i16, i16 addrspace(1)* %dst, align 2
+  %acc = load i16, ptr addrspace(1) %dst, align 2
   %mad1 = add i16 %mul1, %acc
   %mad2 = add i16 %mad1, %mul2
   %mad3 = add i16 %mad2, %mul3
   %mad4 = add i16 %mad3, %mul4
 
-  store i16 %mad4, i16 addrspace(1)* %dst, align 2
+  store i16 %mad4, ptr addrspace(1) %dst, align 2
   ret void
 }
 
-define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc8:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -527,14 +527,14 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                      <4 x i8> addrspace(1)* %src2,
-                                      i8 addrspace(1)* nocapture %dst) {
+                                      ptr addrspace(1) %src2,
+                                      ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %v2e0 = extractelement <4 x i8> %vec2, i64 0
@@ -552,18 +552,18 @@ entry:
   %v2e3 = extractelement <4 x i8> %vec2, i64 3
   %mul4 = mul nuw nsw i8 %v1e3, %v2e3
 
-  %acc = load i8, i8 addrspace(1)* %dst, align 2
+  %acc = load i8, ptr addrspace(1) %dst, align 2
   %mad1 = add i8 %mul1, %acc
   %mad2 = add i8 %mad1, %mul2
   %mad3 = add i8 %mad2, %mul3
   %mad4 = add i8 %mad3, %mul4
 
-  store i8 %mad4, i8 addrspace(1)* %dst, align 2
+  store i8 %mad4, ptr addrspace(1) %dst, align 2
   ret void
 }
 
 ; TODO: Generate udot4?
-define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot2_8:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -680,14 +680,14 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v2
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                   <4 x i8> addrspace(1)* %src2,
-                                   i8 addrspace(1)* nocapture %dst) {
+                                   ptr addrspace(1) %src2,
+                                   ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %v2e0 = extractelement <4 x i8> %vec2, i64 0
@@ -697,14 +697,14 @@ entry:
   %v2e1 = extractelement <4 x i8> %vec2, i64 1
   %mul2 = mul nuw nsw i8 %v1e1, %v2e1
 
-  %acc = load i8, i8 addrspace(1)* %dst, align 2
+  %acc = load i8, ptr addrspace(1) %dst, align 2
   %mad1 = add i8 %mul1, %acc
   %mad2 = add i8 %mad1, %mul2
-  store i8 %mad2, i8 addrspace(1)* %dst, align 2
+  store i8 %mad2, ptr addrspace(1) %dst, align 2
   ret void
 }
 
-define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_CommutationInsideMAD:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -851,14 +851,14 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                      <4 x i8> addrspace(1)* %src2,
-                                                      i8 addrspace(1)* nocapture %dst) {
+                                                      ptr addrspace(1) %src2,
+                                                      ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %v2e0 = extractelement <4 x i8> %vec2, i64 0
@@ -876,18 +876,18 @@ entry:
   %v2e3 = extractelement <4 x i8> %vec2, i64 3
   %mul4 = mul nuw nsw i8 %v2e3, %v1e3
 
-  %acc = load i8, i8 addrspace(1)* %dst, align 2
+  %acc = load i8, ptr addrspace(1) %dst, align 2
   %mad1 = add i8 %acc, %mul1
   %mad2 = add i8 %mul2, %mad1
   %mad3 = add i8 %mul3, %mad2
   %mad4 = add i8 %mul4, %mad3
 
-  store i8 %mad4, i8 addrspace(1)* %dst, align 2
+  store i8 %mad4, ptr addrspace(1) %dst, align 2
   ret void
 }
 
 ; TODO: Support commutation across the adds.
-define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_CommutationAccrossMADs:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1034,14 +1034,14 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                        <4 x i8> addrspace(1)* %src2,
-                                                        i8 addrspace(1)* nocapture %dst) {
+                                                        ptr addrspace(1) %src2,
+                                                        ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %v2e0 = extractelement <4 x i8> %vec2, i64 0
@@ -1059,17 +1059,17 @@ entry:
   %v2e3 = extractelement <4 x i8> %vec2, i64 3
   %mul4 = mul nuw nsw i8 %v2e3, %v1e3
 
-  %acc = load i8, i8 addrspace(1)* %dst, align 2
+  %acc = load i8, ptr addrspace(1) %dst, align 2
   %mad1 = add i8 %acc, %mul2
   %mad2 = add i8 %mad1, %mul1
   %mad3 = add i8 %mad2, %mul3
   %mad4 = add i8 %mad3, %mul4
 
-  store i8 %mad4, i8 addrspace(1)* %dst, align 2
+  store i8 %mad4, ptr addrspace(1) %dst, align 2
   ret void
 }
 
-define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_multiuse_mul1:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1216,14 +1216,14 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                               <4 x i8> addrspace(1)* %src2,
-                                               i32 addrspace(1)* nocapture %dst) {
+                                               ptr addrspace(1) %src2,
+                                               ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %cv1e0 = zext i8 %v1e0 to i32
@@ -1249,18 +1249,18 @@ entry:
   %cv2e3 = zext i8 %v2e3 to i32
   %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %add = add i32 %mul1, %acc
   %add1 = add i32 %mul2, %add
   %add2 = add i32 %add1, %mul1
   %add3 = add i32 %add2, %mul3
   %add4 = add i32 %add3, %mul4
 
-  store i32 %add4, i32 addrspace(1)* %dst, align 4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_multiuse_add1:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1409,14 +1409,14 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v2
 ; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                               <4 x i8> addrspace(1)* %src2,
-                                               i32 addrspace(1)* nocapture %dst) {
+                                               ptr addrspace(1) %src2,
+                                               ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %cv1e0 = zext i8 %v1e0 to i32
@@ -1442,18 +1442,18 @@ entry:
   %cv2e3 = zext i8 %v2e3 to i32
   %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %add1 = add i32 %mul2, %acc
   %add = add i32 %add1, %acc
   %add2 = add i32 %add1, %mul1
   %add3 = add i32 %add2, %mul3
   %add4 = add i32 %add3, %mul4
   %res = add i32 %add4, %add
-  store i32 %res, i32 addrspace(1)* %dst, align 4
+  store i32 %res, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX7-LABEL: notdot4_mixedtypes:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1622,14 +1622,14 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                              <4 x i8> addrspace(1)* %src2,
-                                              i16 addrspace(1)* nocapture %dst) {
+                                              ptr addrspace(1) %src2,
+                                              ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <4 x i8> %vec1, i64 0
   %cv1e0 = sext i8 %v1e0 to i16
@@ -1655,18 +1655,18 @@ entry:
   %cv2e3 = zext i8 %v2e3 to i16
   %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
 
-  %acc = load i16, i16 addrspace(1)* %dst, align 2
+  %acc = load i16, ptr addrspace(1) %dst, align 2
   %add1 = add i16 %mul2, %acc
   %add2 = add i16 %add1, %mul1
   %add3 = add i16 %add2, %mul3
   %add4 = add i16 %add3, %mul4
 
-  store i16 %add4, i16 addrspace(1)* %dst, align 2
+  store i16 %add4, ptr addrspace(1) %dst, align 2
   ret void
 }
 
 ; TODO: Clean up s_lshr_b32 and support this pattern.
-define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc32_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1803,14 +1803,14 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                              <4 x i8> addrspace(1)* %src2,
-                                              i32 addrspace(1)* nocapture %dst) {
+                                              ptr addrspace(1) %src2,
+                                              ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %cvec1 = zext <4 x i8> %vec1 to <4 x i32>
   %cvec2 = zext <4 x i8> %vec2 to <4 x i32>
@@ -1821,18 +1821,18 @@ entry:
   %mul2 = extractelement <4 x i32> %mul, i64 2
   %mul3 = extractelement <4 x i32> %mul, i64 3
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %add1 = add i32 %mul0, %acc
   %add2 = add i32 %add1, %mul1
   %add3 = add i32 %add2, %mul2
   %add4 = add i32 %add3, %mul3
 
-  store i32 %add4, i32 addrspace(1)* %dst, align 4
+  store i32 %add4, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: This pattern should be recognized.
-define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc16_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2016,14 +2016,14 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                              <4 x i8> addrspace(1)* %src2,
-                                              i16 addrspace(1)* nocapture %dst) {
+                                              ptr addrspace(1) %src2,
+                                              ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %cvec1 = zext <4 x i8> %vec1 to <4 x i16>
   %cvec2 = zext <4 x i8> %vec2 to <4 x i16>
@@ -2034,18 +2034,18 @@ entry:
   %mul2 = extractelement <4 x i16> %mul, i64 2
   %mul3 = extractelement <4 x i16> %mul, i64 3
 
-  %acc = load i16, i16 addrspace(1)* %dst, align 4
+  %acc = load i16, ptr addrspace(1) %dst, align 4
   %add1 = add i16 %mul0, %acc
   %add2 = add i16 %add1, %mul1
   %add3 = add i16 %add2, %mul2
   %add4 = add i16 %add3, %mul3
 
-  store i16 %add4, i16 addrspace(1)* %dst, align 4
+  store i16 %add4, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Support this pattern.
-define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
+define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc8_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2208,14 +2208,14 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v2
 ; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                             <4 x i8> addrspace(1)* %src2,
-                                             i8 addrspace(1)* nocapture %dst) {
+                                             ptr addrspace(1) %src2,
+                                             ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
-  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
 
   %mul = mul <4 x i8> %vec1, %vec2
   %mul0 = extractelement <4 x i8> %mul, i64 0
@@ -2223,13 +2223,13 @@ entry:
   %mul2 = extractelement <4 x i8> %mul, i64 2
   %mul3 = extractelement <4 x i8> %mul, i64 3
 
-  %acc = load i8, i8 addrspace(1)* %dst, align 4
+  %acc = load i8, ptr addrspace(1) %dst, align 4
   %add1 = add i8 %mul0, %acc
   %add2 = add i8 %add1, %mul1
   %add3 = add i8 %add2, %mul2
   %add4 = add i8 %add3, %mul3
 
-  store i8 %add4, i8 addrspace(1)* %dst, align 4
+  store i8 %add4, ptr addrspace(1) %dst, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index f47c20ac68325..55a57ef67a3e3 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -8,7 +8,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
 
-define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot8_acc32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -249,14 +249,14 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_dot8_i32_i4 v0, s0, s1, v0
 ; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
-                                       <8 x i4> addrspace(1)* %src2,
-                                       i32 addrspace(1)* nocapture %dst) {
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <8 x i4> %vec1, i64 0
   %cv1e0 = sext i4 %v1e0 to i32
@@ -306,7 +306,7 @@ entry:
   %cv2e7 = sext i4 %v2e7 to i32
   %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %add1 = add i32 %mul0, %acc
   %add2 = add i32 %add1, %mul1
   %add3 = add i32 %add2, %mul2
@@ -316,13 +316,13 @@ entry:
   %add7 = add i32 %add6, %mul6
   %add8 = add i32 %add7, %mul7
 
-  store i32 %add8, i32 addrspace(1)* %dst, align 4
+  store i32 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Once the unnecessary zero extensions of the elements are removed,
 ; the pattern recognizer will kick in.
-define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot8_acc16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -822,14 +822,14 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s0, s1, v1
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
-                                       <8 x i4> addrspace(1)* %src2,
-                                       i16 addrspace(1)* nocapture %dst) {
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <8 x i4> %vec1, i64 0
   %cv1e0 = sext i4 %v1e0 to i16
@@ -879,7 +879,7 @@ entry:
   %cv2e7 = sext i4 %v2e7 to i16
   %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
 
-  %acc = load i16, i16 addrspace(1)* %dst, align 4
+  %acc = load i16, ptr addrspace(1) %dst, align 4
   %add1 = add i16 %mul0, %acc
   %add2 = add i16 %add1, %mul1
   %add3 = add i16 %add2, %mul2
@@ -889,12 +889,12 @@ entry:
   %add7 = add i16 %add6, %mul6
   %add8 = add i16 %add7, %mul7
 
-  store i16 %add8, i16 addrspace(1)* %dst, align 4
+  store i16 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Support this pattern.
-define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot8_acc8:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -1394,14 +1394,14 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s0, s1, v1
 ; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
-                                       <8 x i4> addrspace(1)* %src2,
-                                       i8 addrspace(1)* nocapture %dst) {
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <8 x i4> %vec1, i64 0
   %cv1e0 = sext i4 %v1e0 to i8
@@ -1451,7 +1451,7 @@ entry:
   %cv2e7 = sext i4 %v2e7 to i8
   %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
 
-  %acc = load i8, i8 addrspace(1)* %dst, align 4
+  %acc = load i8, ptr addrspace(1) %dst, align 4
   %add1 = add i8 %mul0, %acc
   %add2 = add i8 %add1, %mul1
   %add3 = add i8 %add2, %mul2
@@ -1461,13 +1461,13 @@ entry:
   %add7 = add i8 %add6, %mul6
   %add8 = add i8 %add7, %mul7
 
-  store i8 %add8, i8 addrspace(1)* %dst, align 4
+  store i8 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; Make sure the pattern is not recognized if there are multiple uses of the
 ; intermediate multiplications.
-define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot8_multiuses_mul1:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -1828,14 +1828,14 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                <8 x i4> addrspace(1)* %src2,
-                                                i32 addrspace(1)* nocapture %dst) {
+                                                ptr addrspace(1) %src2,
+                                                ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <8 x i4> %vec1, i64 0
   %cv1e0 = sext i4 %v1e0 to i32
@@ -1885,7 +1885,7 @@ entry:
   %cv2e7 = sext i4 %v2e7 to i32
   %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %add =  add i32  %mul0, %acc
   %add1 = add i32 %mul0, %add
   %add2 = add i32 %add1, %mul1
@@ -1897,12 +1897,12 @@ entry:
   %add8 = add i32 %add7, %mul7
 
   %res = add i32 %add, %add8
-  store i32 %res, i32 addrspace(1)* %dst, align 4
+  store i32 %res, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Support this pattern.
-define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot8_acc32_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -2143,14 +2143,14 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_dot8_i32_i4 v0, s0, s1, v0
 ; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
-                                              <8 x i4> addrspace(1)* %src2,
-                                              i32 addrspace(1)* nocapture %dst) {
+                                              ptr addrspace(1) %src2,
+                                              ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %cvec1 = sext <8 x i4> %vec1 to <8 x i32>
   %cvec2 = sext <8 x i4> %vec2 to <8 x i32>
@@ -2165,7 +2165,7 @@ entry:
   %mul6 = extractelement <8 x i32> %mul, i64 6
   %mul7 = extractelement <8 x i32> %mul, i64 7
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %add1 = add i32 %mul0, %acc
   %add2 = add i32 %add1, %mul1
   %add3 = add i32 %add2, %mul2
@@ -2175,12 +2175,12 @@ entry:
   %add7 = add i32 %add6, %mul6
   %add8 = add i32 %add7, %mul7
 
-  store i32 %add8, i32 addrspace(1)* %dst, align 4
+  store i32 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Support this pattern.
-define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot8_acc16_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -2757,14 +2757,14 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
-                                              <8 x i4> addrspace(1)* %src2,
-                                              i16 addrspace(1)* nocapture %dst) {
+                                              ptr addrspace(1) %src2,
+                                              ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %cvec1 = sext <8 x i4> %vec1 to <8 x i16>
   %cvec2 = sext <8 x i4> %vec2 to <8 x i16>
@@ -2779,7 +2779,7 @@ entry:
   %mul6 = extractelement <8 x i16> %mul, i64 6
   %mul7 = extractelement <8 x i16> %mul, i64 7
 
-  %acc = load i16, i16 addrspace(1)* %dst, align 4
+  %acc = load i16, ptr addrspace(1) %dst, align 4
   %add1 = add i16 %mul0, %acc
   %add2 = add i16 %add1, %mul1
   %add3 = add i16 %add2, %mul2
@@ -2789,12 +2789,12 @@ entry:
   %add7 = add i16 %add6, %mul6
   %add8 = add i16 %add7, %mul7
 
-  store i16 %add8, i16 addrspace(1)* %dst, align 4
+  store i16 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Support this pattern.
-define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot8_acc8_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -3444,14 +3444,14 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX10-DL-NEXT:    s_endpgm
-                                             <8 x i4> addrspace(1)* %src2,
-                                             i8 addrspace(1)* nocapture %dst) {
+                                             ptr addrspace(1) %src2,
+                                             ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %cvec1 = sext <8 x i4> %vec1 to <8 x i8>
   %cvec2 = sext <8 x i4> %vec2 to <8 x i8>
@@ -3466,7 +3466,7 @@ entry:
   %mul6 = extractelement <8 x i8> %mul, i64 6
   %mul7 = extractelement <8 x i8> %mul, i64 7
 
-  %acc = load i8, i8 addrspace(1)* %dst, align 4
+  %acc = load i8, ptr addrspace(1) %dst, align 4
   %add1 = add i8 %mul0, %acc
   %add2 = add i8 %add1, %mul1
   %add3 = add i8 %add2, %mul2
@@ -3476,7 +3476,7 @@ entry:
   %add7 = add i8 %add6, %mul6
   %add8 = add i8 %add7, %mul7
 
-  store i8 %add8, i8 addrspace(1)* %dst, align 4
+  store i8 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 

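The shape of the conversion is the same in every hunk: the pointee type moves off the pointer type and stays only on the instructions that need it, so getelementptr, load, and store keep their explicit value types while every pointer operand becomes a plain ptr addrspace(1). A minimal sketch of the post-conversion form (the function name below is illustrative, not one of the tests in this patch):

define amdgpu_kernel void @opaque_ptr_sketch(ptr addrspace(1) %src, i32 %idx) {
  ; The element type is carried by the GEP and the load, not by the pointer type.
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %src, i32 %idx
  %vec = load <4 x i8>, ptr addrspace(1) %gep
  ret void
}
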
diff  --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 1969ac68efcfe..1c092fcbf55cf 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 
-define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot8_acc32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -204,14 +204,14 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                       <8 x i4> addrspace(1)* %src2,
-                                       i32 addrspace(1)* nocapture %dst) {
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <8 x i4> %vec1, i64 0
   %cv1e0 = zext i4 %v1e0 to i32
@@ -261,7 +261,7 @@ entry:
   %cv2e7 = zext i4 %v2e7 to i32
   %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %add1 = add i32 %mul0, %acc
   %add2 = add i32 %add1, %mul1
   %add3 = add i32 %add2, %mul2
@@ -271,13 +271,13 @@ entry:
   %add7 = add i32 %add6, %mul6
   %add8 = add i32 %add7, %mul7
 
-  store i32 %add8, i32 addrspace(1)* %dst, align 4
+  store i32 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Remove the unnecessary instruction (the one zero-extending the
 ; 2nd MAD) so that the pattern recognizer kicks in.
-define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot8_acc16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -522,14 +522,14 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                       <8 x i4> addrspace(1)* %src2,
-                                       i16 addrspace(1)* nocapture %dst) {
+                                       ptr addrspace(1) %src2,
+                                       ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <8 x i4> %vec1, i64 0
   %cv1e0 = zext i4 %v1e0 to i16
@@ -579,7 +579,7 @@ entry:
   %cv2e7 = zext i4 %v2e7 to i16
   %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
 
-  %acc = load i16, i16 addrspace(1)* %dst, align 4
+  %acc = load i16, ptr addrspace(1) %dst, align 4
   %add1 = add i16 %mul0, %acc
   %add2 = add i16 %add1, %mul1
   %add3 = add i16 %add2, %mul2
@@ -589,13 +589,13 @@ entry:
   %add7 = add i16 %add6, %mul6
   %add8 = add i16 %add7, %mul7
 
-  store i16 %add8, i16 addrspace(1)* %dst, align 4
+  store i16 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Remove the unnecessary instruction (the one zero-extending the
 ; 2nd MAD) so that the pattern recognizer kicks in.
-define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot8_acc8:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -840,14 +840,14 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                      <8 x i4> addrspace(1)* %src2,
-                                      i8 addrspace(1)* nocapture %dst) {
+                                      ptr addrspace(1) %src2,
+                                      ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <8 x i4> %vec1, i64 0
   %cv1e0 = zext i4 %v1e0 to i8
@@ -897,7 +897,7 @@ entry:
   %cv2e7 = zext i4 %v2e7 to i8
   %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
 
-  %acc = load i8, i8 addrspace(1)* %dst, align 4
+  %acc = load i8, ptr addrspace(1) %dst, align 4
   %add1 = add i8 %mul0, %acc
   %add2 = add i8 %add1, %mul1
   %add3 = add i8 %add2, %mul2
@@ -907,13 +907,13 @@ entry:
   %add7 = add i8 %add6, %mul6
   %add8 = add i8 %add7, %mul7
 
-  store i8 %add8, i8 addrspace(1)* %dst, align 4
+  store i8 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Remove the two unnecessary instructions (and+add after the 2nd MAD)
 ; so that the pattern recognizer kicks in.
-define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot8_acc4:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -1163,14 +1163,14 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                      <8 x i4> addrspace(1)* %src2,
-                                      i4 addrspace(1)* nocapture %dst) {
+                                      ptr addrspace(1) %src2,
+                                      ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <8 x i4> %vec1, i64 0
   %v2e0 = extractelement <8 x i4> %vec2, i64 0
@@ -1204,7 +1204,7 @@ entry:
   %v2e7 = extractelement <8 x i4> %vec2, i64 7
   %mul7 = mul nuw nsw i4 %v1e7, %v2e7
 
-  %acc = load i4, i4 addrspace(1)* %dst, align 4
+  %acc = load i4, ptr addrspace(1) %dst, align 4
   %add1 = add i4 %mul0, %acc
   %add2 = add i4 %add1, %mul1
   %add3 = add i4 %add2, %mul2
@@ -1214,13 +1214,13 @@ entry:
   %add7 = add i4 %add6, %mul6
   %add8 = add i4 %add7, %mul7
 
-  store i4 %add8, i4 addrspace(1)* %dst, align 4
+  store i4 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Currently, permutation of udot8 is turned off due to a huge increase
 ; in the compile time.
-define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot8_CommutationInsideMAD:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -1470,14 +1470,14 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
 ; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                      <8 x i4> addrspace(1)* %src2,
-                                                      i4 addrspace(1)* nocapture %dst) {
+                                                      ptr addrspace(1) %src2,
+                                                      ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <8 x i4> %vec1, i64 0
   %v2e0 = extractelement <8 x i4> %vec2, i64 0
@@ -1511,7 +1511,7 @@ entry:
   %v2e7 = extractelement <8 x i4> %vec2, i64 7
   %mul7 = mul nuw nsw i4 %v1e7, %v2e7
 
-  %acc = load i4, i4 addrspace(1)* %dst, align 4
+  %acc = load i4, ptr addrspace(1) %dst, align 4
   %add1 = add i4 %mul0, %acc
   %add2 = add i4 %mul1, %add1
   %add3 = add i4 %mul2, %add2
@@ -1521,11 +1521,11 @@ entry:
   %add7 = add i4 %mul6, %add6
   %add8 = add i4 %mul7, %add7
 
-  store i4 %add8, i4 addrspace(1)* %dst, align 4
+  store i4 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot8_multiuses_mul1:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -1788,14 +1788,14 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v3, v13, v0
 ; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                                <8 x i4> addrspace(1)* %src2,
-                                                i32 addrspace(1)* nocapture %dst) {
+                                                ptr addrspace(1) %src2,
+                                                ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %v1e0 = extractelement <8 x i4> %vec1, i64 0
   %cv1e0 = zext i4 %v1e0 to i32
@@ -1845,7 +1845,7 @@ entry:
   %cv2e7 = zext i4 %v2e7 to i32
   %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %add1 = add i32 %mul0, %acc
   %add = add i32  %mul0, %add1
   %add2 = add i32 %add1, %mul1
@@ -1857,11 +1857,11 @@ entry:
   %add8 = add i32 %add7, %mul7
 
   %res = add i32 %add, %add8
-  store i32 %res, i32 addrspace(1)* %dst, align 4
+  store i32 %res, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot8_acc32_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -2059,14 +2059,14 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                              <8 x i4> addrspace(1)* %src2,
-                                              i32 addrspace(1)* nocapture %dst) {
+                                              ptr addrspace(1) %src2,
+                                              ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %cvec1 = zext <8 x i4> %vec1 to <8 x i32>
   %cvec2 = zext <8 x i4> %vec2 to <8 x i32>
@@ -2081,7 +2081,7 @@ entry:
   %mul6 = extractelement <8 x i32> %mul, i64 6
   %mul7 = extractelement <8 x i32> %mul, i64 7
 
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
   %add1 = add i32 %mul0, %acc
   %add2 = add i32 %add1, %mul1
   %add3 = add i32 %add2, %mul2
@@ -2091,13 +2091,13 @@ entry:
   %add7 = add i32 %add6, %mul6
   %add8 = add i32 %add7, %mul7
 
-  store i32 %add8, i32 addrspace(1)* %dst, align 4
+  store i32 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Clean up the code (by default pk_mad_I16 should be generated), then
 ; support the pattern.
-define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot8_acc16_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -2383,14 +2383,14 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                              <8 x i4> addrspace(1)* %src2,
-                                              i16 addrspace(1)* nocapture %dst) {
+                                              ptr addrspace(1) %src2,
+                                              ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %cvec1 = zext <8 x i4> %vec1 to <8 x i16>
   %cvec2 = zext <8 x i4> %vec2 to <8 x i16>
@@ -2405,7 +2405,7 @@ entry:
   %mul6 = extractelement <8 x i16> %mul, i64 6
   %mul7 = extractelement <8 x i16> %mul, i64 7
 
-  %acc = load i16, i16 addrspace(1)* %dst, align 4
+  %acc = load i16, ptr addrspace(1) %dst, align 4
   %add1 = add i16 %mul0, %acc
   %add2 = add i16 %add1, %mul1
   %add3 = add i16 %add2, %mul2
@@ -2415,12 +2415,12 @@ entry:
   %add7 = add i16 %add6, %mul6
   %add8 = add i16 %add7, %mul7
 
-  store i16 %add8, i16 addrspace(1)* %dst, align 4
+  store i16 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Clean up the code to generate MAD; the pattern should then be recognized.
-define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot8_acc8_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -2749,14 +2749,14 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-DL-NEXT:    global_store_byte v4, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                             <8 x i4> addrspace(1)* %src2,
-                                             i8 addrspace(1)* nocapture %dst) {
+                                             ptr addrspace(1) %src2,
+                                             ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %cvec1 = zext <8 x i4> %vec1 to <8 x i8>
   %cvec2 = zext <8 x i4> %vec2 to <8 x i8>
@@ -2771,7 +2771,7 @@ entry:
   %mul6 = extractelement <8 x i8> %mul, i64 6
   %mul7 = extractelement <8 x i8> %mul, i64 7
 
-  %acc = load i8, i8 addrspace(1)* %dst, align 4
+  %acc = load i8, ptr addrspace(1) %dst, align 4
   %add1 = add i8 %mul0, %acc
   %add2 = add i8 %add1, %mul1
   %add3 = add i8 %add2, %mul2
@@ -2781,12 +2781,12 @@ entry:
   %add7 = add i8 %add6, %mul6
   %add8 = add i8 %add7, %mul7
 
-  store i8 %add8, i8 addrspace(1)* %dst, align 4
+  store i8 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
 ; TODO: Once the additional "and+add" instructions are removed, the pattern will be recognized.
-define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
+define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot8_acc4_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -3077,14 +3077,14 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                             <8 x i4> addrspace(1)* %src2,
-                                             i4 addrspace(1)* nocapture %dst) {
+                                             ptr addrspace(1) %src2,
+                                             ptr addrspace(1) nocapture %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
-  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
-  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
-  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
+  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
+  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
+  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
+  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
 
   %mul = mul <8 x i4> %vec1, %vec2
   %mul0 = extractelement <8 x i4> %mul, i64 0
@@ -3096,7 +3096,7 @@ entry:
   %mul6 = extractelement <8 x i4> %mul, i64 6
   %mul7 = extractelement <8 x i4> %mul, i64 7
 
-  %acc = load i4, i4 addrspace(1)* %dst, align 4
+  %acc = load i4, ptr addrspace(1) %dst, align 4
   %add1 = add i4 %mul0, %acc
   %add2 = add i4 %add1, %mul1
   %add3 = add i4 %add2, %mul2
@@ -3106,11 +3106,11 @@ entry:
   %add7 = add i4 %add6, %mul6
   %add8 = add i4 %add7, %mul7
 
-  store i4 %add8, i4 addrspace(1)* %dst, align 4
+  store i4 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 
-define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
+define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
 ; GFX7-LABEL: udot8_variant1:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3278,14 +3278,14 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
 ; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v2, v1, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
-                                          i32 addrspace(1)* %v2addr,
-                                          i32 addrspace(1)* %dst) {
+                                          ptr addrspace(1) %v2addr,
+                                          ptr addrspace(1) %dst) {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr i32, i32 addrspace(1)* %v1addr, i32 %idx
-  %v1 = load i32, i32 addrspace(1)* %gep1, align 4
-  %gep2 = getelementptr i32, i32 addrspace(1)* %v2addr, i32 %idx
-  %v2 = load i32, i32 addrspace(1)* %gep2, align 4
+  %gep1 = getelementptr i32, ptr addrspace(1) %v1addr, i32 %idx
+  %v1 = load i32, ptr addrspace(1) %gep1, align 4
+  %gep2 = getelementptr i32, ptr addrspace(1) %v2addr, i32 %idx
+  %v2 = load i32, ptr addrspace(1) %gep2, align 4
   %and = and i32 %v1, 15
   %and1 = and i32 %v2, 15
   %mul1 = mul nuw nsw i32 %and1, %and
@@ -3329,7 +3329,7 @@ entry:
   %shr36 = lshr i32 %v1, 28
   %shr37 = lshr i32 %v2, 28
   %mul8 = mul nuw nsw i32 %shr37, %shr36
-  %acc = load i32, i32 addrspace(1)* %dst, align 4
+  %acc = load i32, ptr addrspace(1) %dst, align 4
 
   %add1 = add i32 %mul1, %acc
   %add2 = add i32 %add1, %mul8
@@ -3339,7 +3339,7 @@ entry:
   %add6 = add i32 %add5, %mul5
   %add7 = add i32 %add6, %mul6
   %add8 = add i32 %add7, %mul7
-  store i32 %add8, i32 addrspace(1)* %dst, align 4
+  store i32 %add8, ptr addrspace(1) %dst, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/image-attributes.ll b/llvm/test/CodeGen/AMDGPU/image-attributes.ll
index 53d61e66c6ba8..d7362811dd904 100644
--- a/llvm/test/CodeGen/AMDGPU/image-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-attributes.ll
@@ -7,26 +7,26 @@
 ; FUNC-LABEL: {{^}}width_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[2].Z
-define amdgpu_kernel void @width_2d (%opencl.image2d_t addrspace(1)* %in,
-                       i32 addrspace(1)* %out) {
+define amdgpu_kernel void @width_2d (ptr addrspace(1) %in,
+                       ptr addrspace(1) %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d(
-      %opencl.image2d_t addrspace(1)* %in) #0
+      ptr addrspace(1) %in) #0
   %1 = extractvalue [3 x i32] %0, 0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}width_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[2].Z
-define amdgpu_kernel void @width_3d (%opencl.image3d_t addrspace(1)* %in,
-                       i32 addrspace(1)* %out) {
+define amdgpu_kernel void @width_3d (ptr addrspace(1) %in,
+                       ptr addrspace(1) %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d(
-      %opencl.image3d_t addrspace(1)* %in) #0
+      ptr addrspace(1) %in) #0
   %1 = extractvalue [3 x i32] %0, 0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -37,26 +37,26 @@ entry:
 ; FUNC-LABEL: {{^}}height_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[2].W
-define amdgpu_kernel void @height_2d (%opencl.image2d_t addrspace(1)* %in,
-                        i32 addrspace(1)* %out) {
+define amdgpu_kernel void @height_2d (ptr addrspace(1) %in,
+                        ptr addrspace(1) %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d(
-      %opencl.image2d_t addrspace(1)* %in) #0
+      ptr addrspace(1) %in) #0
   %1 = extractvalue [3 x i32] %0, 1
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}height_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[2].W
-define amdgpu_kernel void @height_3d (%opencl.image3d_t addrspace(1)* %in,
-                        i32 addrspace(1)* %out) {
+define amdgpu_kernel void @height_3d (ptr addrspace(1) %in,
+                        ptr addrspace(1) %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d(
-      %opencl.image3d_t addrspace(1)* %in) #0
+      ptr addrspace(1) %in) #0
   %1 = extractvalue [3 x i32] %0, 1
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -67,13 +67,13 @@ entry:
 ; FUNC-LABEL: {{^}}depth_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].X
-define amdgpu_kernel void @depth_3d (%opencl.image3d_t addrspace(1)* %in,
-                       i32 addrspace(1)* %out) {
+define amdgpu_kernel void @depth_3d (ptr addrspace(1) %in,
+                       ptr addrspace(1) %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d(
-      %opencl.image3d_t addrspace(1)* %in) #0
+      ptr addrspace(1) %in) #0
   %1 = extractvalue [3 x i32] %0, 2
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -84,26 +84,26 @@ entry:
 ; FUNC-LABEL: {{^}}data_type_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].Y
-define amdgpu_kernel void @data_type_2d (%opencl.image2d_t addrspace(1)* %in,
-                           i32 addrspace(1)* %out) {
+define amdgpu_kernel void @data_type_2d (ptr addrspace(1) %in,
+                           ptr addrspace(1) %out) {
 entry:
   %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d(
-      %opencl.image2d_t addrspace(1)* %in) #0
+      ptr addrspace(1) %in) #0
   %1 = extractvalue [2 x i32] %0, 0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}data_type_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].Y
-define amdgpu_kernel void @data_type_3d (%opencl.image3d_t addrspace(1)* %in,
-                                     i32 addrspace(1)* %out) {
+define amdgpu_kernel void @data_type_3d (ptr addrspace(1) %in,
+                                     ptr addrspace(1) %out) {
 entry:
   %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d(
-      %opencl.image3d_t addrspace(1)* %in) #0
+      ptr addrspace(1) %in) #0
   %1 = extractvalue [2 x i32] %0, 0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -114,26 +114,26 @@ entry:
 ; FUNC-LABEL: {{^}}channel_order_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].Z
-define amdgpu_kernel void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in,
-                               i32 addrspace(1)* %out) {
+define amdgpu_kernel void @channel_order_2d (ptr addrspace(1) %in,
+                               ptr addrspace(1) %out) {
 entry:
   %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d(
-      %opencl.image2d_t addrspace(1)* %in) #0
+      ptr addrspace(1) %in) #0
   %1 = extractvalue [2 x i32] %0, 1
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}channel_order_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].Z
-define amdgpu_kernel void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in,
-                                         i32 addrspace(1)* %out) {
+define amdgpu_kernel void @channel_order_3d (ptr addrspace(1) %in,
+                                         ptr addrspace(1) %out) {
 entry:
   %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d(
-      %opencl.image3d_t addrspace(1)* %in) #0
+      ptr addrspace(1) %in) #0
   %1 = extractvalue [2 x i32] %0, 1
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -146,49 +146,48 @@ entry:
 ; FUNC-LABEL: {{^}}image_arg_2nd:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[4].Z
-define amdgpu_kernel void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1,
+define amdgpu_kernel void @image_arg_2nd (ptr addrspace(1) %in1,
                             i32 %x,
-                            %opencl.image2d_t addrspace(1)* %in2,
-                            i32 addrspace(1)* %out) {
+                            ptr addrspace(1) %in2,
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d(
-      %opencl.image2d_t addrspace(1)* %in2) #0
+      ptr addrspace(1) %in2) #0
   %1 = extractvalue [3 x i32] %0, 1
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 %opencl.image2d_t = type opaque
 %opencl.image3d_t = type opaque
 
-declare [3 x i32] @llvm.OpenCL.image.get.size.2d(%opencl.image2d_t addrspace(1)*) #0
-declare [3 x i32] @llvm.OpenCL.image.get.size.3d(%opencl.image3d_t addrspace(1)*) #0
-declare [2 x i32] @llvm.OpenCL.image.get.format.2d(%opencl.image2d_t addrspace(1)*) #0
-declare [2 x i32] @llvm.OpenCL.image.get.format.3d(%opencl.image3d_t addrspace(1)*) #0
+declare [3 x i32] @llvm.OpenCL.image.get.size.2d(ptr addrspace(1)) #0
+declare [3 x i32] @llvm.OpenCL.image.get.size.3d(ptr addrspace(1)) #0
+declare [2 x i32] @llvm.OpenCL.image.get.format.2d(ptr addrspace(1)) #0
+declare [2 x i32] @llvm.OpenCL.image.get.format.3d(ptr addrspace(1)) #0
 
 attributes #0 = { readnone }
 
 !opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9}
-!0 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @width_2d,
+!0 = !{ptr @width_2d,
        !10, !20, !30, !40, !50}
-!1 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @width_3d,
+!1 = !{ptr @width_3d,
        !10, !21, !31, !41, !50}
-!2 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @height_2d,
+!2 = !{ptr @height_2d,
        !10, !20, !30, !40, !50}
-!3 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @height_3d,
+!3 = !{ptr @height_3d,
        !10, !21, !31, !41, !50}
-!4 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @depth_3d,
+!4 = !{ptr @depth_3d,
        !10, !21, !31, !41, !50}
-!5 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @data_type_2d,
+!5 = !{ptr @data_type_2d,
        !10, !20, !30, !40, !50}
-!6 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @data_type_3d,
+!6 = !{ptr @data_type_3d,
        !10, !21, !31, !41, !50}
-!7 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @channel_order_2d,
+!7 = !{ptr @channel_order_2d,
        !10, !20, !30, !40, !50}
-!8 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @channel_order_3d,
+!8 = !{ptr @channel_order_3d,
        !10, !21, !31, !41, !50}
-!9 = !{void (%opencl.image3d_t addrspace(1)*, i32, %opencl.image2d_t addrspace(1)*,
-      i32 addrspace(1)*)* @image_arg_2nd, !12, !22, !32, !42, !52}
+!9 = !{ptr @image_arg_2nd, !12, !22, !32, !42, !52}
 
 !10 = !{!"kernel_arg_addr_space", i32 1, i32 1}
 !20 = !{!"kernel_arg_access_qual", !"read_only", !"none"}

diff  --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
index 60cff45c3e72c..106d82278b554 100644
--- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
@@ -88,8 +88,8 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
   %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
   %v.data = extractvalue { half, i32 } %v, 0
   %v.err = extractvalue { half, i32 } %v, 1
-  store volatile half %v.data, half addrspace(1)* undef
-  store volatile i32 %v.err, i32 addrspace(1)* undef
+  store volatile half %v.data, ptr addrspace(1) undef
+  store volatile i32 %v.err, ptr addrspace(1) undef
   ret void
 }
 
@@ -177,8 +177,8 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
   %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
   %v.data = extractvalue { half, i32 } %v, 0
   %v.err = extractvalue { half, i32 } %v, 1
-  store volatile half %v.data, half addrspace(1)* undef
-  store volatile i32 %v.err, i32 addrspace(1)* undef
+  store volatile half %v.data, ptr addrspace(1) undef
+  store volatile i32 %v.err, ptr addrspace(1) undef
   ret void
 }
 
@@ -266,8 +266,8 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
   %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
   %v.data = extractvalue { <2 x half>, i32 } %v, 0
   %v.err = extractvalue { <2 x half>, i32 } %v, 1
-  store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
-  store volatile i32 %v.err, i32 addrspace(1)* undef
+  store volatile <2 x half> %v.data, ptr addrspace(1) undef
+  store volatile i32 %v.err, ptr addrspace(1) undef
   ret void
 }
 
@@ -355,8 +355,8 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
   %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
   %v.data = extractvalue { <2 x half>, i32 } %v, 0
   %v.err = extractvalue { <2 x half>, i32 } %v, 1
-  store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
-  store volatile i32 %v.err, i32 addrspace(1)* undef
+  store volatile <2 x half> %v.data, ptr addrspace(1) undef
+  store volatile i32 %v.err, ptr addrspace(1) undef
   ret void
 }
 
@@ -447,8 +447,8 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
   %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
   %v.data = extractvalue { <2 x half>, i32 } %v, 0
   %v.err = extractvalue { <2 x half>, i32 } %v, 1
-  store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
-  store volatile i32 %v.err, i32 addrspace(1)* undef
+  store volatile <2 x half> %v.data, ptr addrspace(1) undef
+  store volatile i32 %v.err, ptr addrspace(1) undef
   ret void
 }
 
@@ -551,8 +551,8 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
   %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
   %v.data = extractvalue { <3 x half>, i32 } %v, 0
   %v.err = extractvalue { <3 x half>, i32 } %v, 1
-  store volatile <3 x half> %v.data, <3 x half> addrspace(1)* undef
-  store volatile i32 %v.err, i32 addrspace(1)* undef
+  store volatile <3 x half> %v.data, ptr addrspace(1) undef
+  store volatile i32 %v.err, ptr addrspace(1) undef
   ret void
 }
 
@@ -650,8 +650,8 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
   %v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
   %v.data = extractvalue { <4 x half>, i32 } %v, 0
   %v.err = extractvalue { <4 x half>, i32 } %v, 1
-  store volatile <4 x half> %v.data, <4 x half> addrspace(1)* undef
-  store volatile i32 %v.err, i32 addrspace(1)* undef
+  store volatile <4 x half> %v.data, ptr addrspace(1) undef
+  store volatile i32 %v.err, ptr addrspace(1) undef
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/image-resource-id.ll b/llvm/test/CodeGen/AMDGPU/image-resource-id.ll
index dac7c7ddaeac9..89338bc9b10a5 100644
--- a/llvm/test/CodeGen/AMDGPU/image-resource-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-resource-id.ll
@@ -7,12 +7,12 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define amdgpu_kernel void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2d_rd_1_0(ptr addrspace(1) %in, ; read_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
-      %opencl.image2d_t addrspace(1)* %in) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -21,12 +21,12 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define amdgpu_kernel void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_3d_rd_1_0(ptr addrspace(1) %in, ; read_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
-      %opencl.image3d_t addrspace(1)* %in) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -37,12 +37,12 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define amdgpu_kernel void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2d_wr_1_0(ptr addrspace(1) %in, ; write_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
-      %opencl.image2d_t addrspace(1)* %in) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -51,12 +51,12 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define amdgpu_kernel void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_3d_wr_1_0(ptr addrspace(1) %in, ; write_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
-      %opencl.image3d_t addrspace(1)* %in) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -67,13 +67,13 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define amdgpu_kernel void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
-                            %opencl.image2d_t addrspace(1)* %in2, ; read_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2d_rd_2_0(ptr addrspace(1) %in1, ; read_only
+                            ptr addrspace(1) %in2, ; read_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
-      %opencl.image2d_t addrspace(1)* %in1) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in1) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -82,13 +82,13 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define amdgpu_kernel void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only
-                            %opencl.image2d_t addrspace(1)* %in2, ; read_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2d_rd_2_1(ptr addrspace(1) %in1, ; read_only
+                            ptr addrspace(1) %in2, ; read_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
-      %opencl.image2d_t addrspace(1)* %in2) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in2) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -97,13 +97,13 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define amdgpu_kernel void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
-                            %opencl.image3d_t addrspace(1)* %in2, ; read_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_3d_rd_2_0(ptr addrspace(1) %in1, ; read_only
+                            ptr addrspace(1) %in2, ; read_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
-      %opencl.image3d_t addrspace(1)* %in1) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in1) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -112,13 +112,13 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define amdgpu_kernel void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only
-                            %opencl.image3d_t addrspace(1)* %in2, ; read_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_3d_rd_2_1(ptr addrspace(1) %in1, ; read_only
+                            ptr addrspace(1) %in2, ; read_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
-      %opencl.image3d_t addrspace(1)* %in2) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in2) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -129,13 +129,13 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define amdgpu_kernel void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
-                            %opencl.image2d_t addrspace(1)* %in2, ; write_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2d_wr_2_0(ptr addrspace(1) %in1, ; write_only
+                            ptr addrspace(1) %in2, ; write_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
-      %opencl.image2d_t addrspace(1)* %in1) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in1) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -144,13 +144,13 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define amdgpu_kernel void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
-                            %opencl.image2d_t addrspace(1)* %in2, ; write_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2d_wr_2_1(ptr addrspace(1) %in1, ; write_only
+                            ptr addrspace(1) %in2, ; write_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
-      %opencl.image2d_t addrspace(1)* %in2) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in2) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -159,13 +159,13 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define amdgpu_kernel void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
-                            %opencl.image3d_t addrspace(1)* %in2, ; write_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_3d_wr_2_0(ptr addrspace(1) %in1, ; write_only
+                            ptr addrspace(1) %in2, ; write_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
-      %opencl.image3d_t addrspace(1)* %in1) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in1) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -174,13 +174,13 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define amdgpu_kernel void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
-                            %opencl.image3d_t addrspace(1)* %in2, ; write_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_3d_wr_2_1(ptr addrspace(1) %in1, ; write_only
+                            ptr addrspace(1) %in2, ; write_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
-      %opencl.image3d_t addrspace(1)* %in2) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in2) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -191,14 +191,14 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define amdgpu_kernel void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
-                            %opencl.image3d_t addrspace(1)* %in2, ; read_only
-                            %opencl.image2d_t addrspace(1)* %in3, ; read_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2d_rd_3_0(ptr addrspace(1) %in1, ; read_only
+                            ptr addrspace(1) %in2, ; read_only
+                            ptr addrspace(1) %in3, ; read_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
-      %opencl.image2d_t addrspace(1)* %in3) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in3) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -208,14 +208,14 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define amdgpu_kernel void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
-                            %opencl.image2d_t addrspace(1)* %in2, ; read_only
-                            %opencl.image3d_t addrspace(1)* %in3, ; read_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_3d_rd_3_0(ptr addrspace(1) %in1, ; read_only
+                            ptr addrspace(1) %in2, ; read_only
+                            ptr addrspace(1) %in3, ; read_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
-      %opencl.image3d_t addrspace(1)* %in3) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in3) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -226,14 +226,14 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define amdgpu_kernel void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
-                            %opencl.image3d_t addrspace(1)* %in2, ; write_only
-                            %opencl.image2d_t addrspace(1)* %in3, ; write_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2d_wr_3_0(ptr addrspace(1) %in1, ; write_only
+                            ptr addrspace(1) %in2, ; write_only
+                            ptr addrspace(1) %in3, ; write_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
-      %opencl.image2d_t addrspace(1)* %in3) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in3) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -243,14 +243,14 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define amdgpu_kernel void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
-                            %opencl.image2d_t addrspace(1)* %in2, ; write_only
-                            %opencl.image3d_t addrspace(1)* %in3, ; write_only
-                            i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_3d_wr_3_0(ptr addrspace(1) %in1, ; write_only
+                            ptr addrspace(1) %in2, ; write_only
+                            ptr addrspace(1) %in3, ; write_only
+                            ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
-      %opencl.image3d_t addrspace(1)* %in3) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in3) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -261,14 +261,14 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define amdgpu_kernel void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
-                             %opencl.image3d_t addrspace(1)* %in2, ; read_only
-                             %opencl.image2d_t addrspace(1)* %in3, ; read_only
-                             i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2d_mix_3_0(ptr addrspace(1) %in1, ; write_only
+                             ptr addrspace(1) %in2, ; read_only
+                             ptr addrspace(1) %in3, ; read_only
+                             ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
-      %opencl.image2d_t addrspace(1)* %in3) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in3) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -277,14 +277,14 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define amdgpu_kernel void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
-                             %opencl.image2d_t addrspace(1)* %in2, ; read_only
-                             %opencl.image3d_t addrspace(1)* %in3, ; read_only
-                             i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_3d_mix_3_0(ptr addrspace(1) %in1, ; write_only
+                             ptr addrspace(1) %in2, ; read_only
+                             ptr addrspace(1) %in3, ; read_only
+                             ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
-      %opencl.image3d_t addrspace(1)* %in3) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in3) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -293,14 +293,14 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define amdgpu_kernel void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
-                             %opencl.image3d_t addrspace(1)* %in2, ; read_only
-                             %opencl.image2d_t addrspace(1)* %in3, ; write_only
-                             i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2d_mix_3_1(ptr addrspace(1) %in1, ; write_only
+                             ptr addrspace(1) %in2, ; read_only
+                             ptr addrspace(1) %in3, ; write_only
+                             ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
-      %opencl.image2d_t addrspace(1)* %in3) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in3) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -309,14 +309,14 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define amdgpu_kernel void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
-                             %opencl.image2d_t addrspace(1)* %in2, ; read_only
-                             %opencl.image3d_t addrspace(1)* %in3, ; write_only
-                             i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_3d_mix_3_1(ptr addrspace(1) %in1, ; write_only
+                             ptr addrspace(1) %in2, ; read_only
+                             ptr addrspace(1) %in3, ; write_only
+                             ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
-      %opencl.image3d_t addrspace(1)* %in3) #0
-  store i32 %0, i32 addrspace(1)* %out
+      ptr addrspace(1) %in3) #0
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -324,20 +324,20 @@ entry:
 %opencl.image2d_t = type opaque
 %opencl.image3d_t = type opaque
 
-declare i32 @llvm.OpenCL.image.get.resource.id.2d(%opencl.image2d_t addrspace(1)*) #0
-declare i32 @llvm.OpenCL.image.get.resource.id.3d(%opencl.image3d_t addrspace(1)*) #0
+declare i32 @llvm.OpenCL.image.get.resource.id.2d(ptr addrspace(1)) #0
+declare i32 @llvm.OpenCL.image.get.resource.id.3d(ptr addrspace(1)) #0
 
 attributes #0 = { readnone }
 
 !opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13,
                     !14, !15, !16, !17, !18, !19}
-!0 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_rd_1_0,
+!0 = !{ptr @test_2d_rd_1_0,
        !110, !120, !130, !140, !150}
-!1 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_rd_1_0,
+!1 = !{ptr @test_3d_rd_1_0,
        !110, !120, !131, !141, !150}
-!2 = !{void (%opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_wr_1_0,
+!2 = !{ptr @test_2d_wr_1_0,
        !110, !121, !130, !140, !150}
-!3 = !{void (%opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_wr_1_0,
+!3 = !{ptr @test_3d_wr_1_0,
        !110, !121, !131, !141, !150}
 !110 = !{!"kernel_arg_addr_space", i32 1, i32 1}
 !120 = !{!"kernel_arg_access_qual", !"read_only", !"none"}
@@ -348,22 +348,14 @@ attributes #0 = { readnone }
 !141 = !{!"kernel_arg_base_type", !"image3d_t", !"int*"}
 !150 = !{!"kernel_arg_type_qual", !"", !""}
 
-!4  = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*,
-              i32 addrspace(1)*)* @test_2d_rd_2_0, !112, !122, !132, !142, !152}
-!5  = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*,
-              i32 addrspace(1)*)* @test_2d_rd_2_1, !112, !122, !132, !142, !152}
-!6  = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*,
-              i32 addrspace(1)*)* @test_3d_rd_2_0, !112, !122, !133, !143, !152}
-!7  = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*,
-              i32 addrspace(1)*)* @test_3d_rd_2_1, !112, !122, !133, !143, !152}
-!8  = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*,
-              i32 addrspace(1)*)* @test_2d_wr_2_0, !112, !123, !132, !142, !152}
-!9  = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*,
-              i32 addrspace(1)*)* @test_2d_wr_2_1, !112, !123, !132, !142, !152}
-!10 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*,
-              i32 addrspace(1)*)* @test_3d_wr_2_0, !112, !123, !133, !143, !152}
-!11 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*,
-              i32 addrspace(1)*)* @test_3d_wr_2_1, !112, !123, !133, !143, !152}
+!4  = !{ptr @test_2d_rd_2_0, !112, !122, !132, !142, !152}
+!5  = !{ptr @test_2d_rd_2_1, !112, !122, !132, !142, !152}
+!6  = !{ptr @test_3d_rd_2_0, !112, !122, !133, !143, !152}
+!7  = !{ptr @test_3d_rd_2_1, !112, !122, !133, !143, !152}
+!8  = !{ptr @test_2d_wr_2_0, !112, !123, !132, !142, !152}
+!9  = !{ptr @test_2d_wr_2_1, !112, !123, !132, !142, !152}
+!10 = !{ptr @test_3d_wr_2_0, !112, !123, !133, !143, !152}
+!11 = !{ptr @test_3d_wr_2_1, !112, !123, !133, !143, !152}
 !112 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1}
 !122 = !{!"kernel_arg_access_qual", !"read_only", !"read_only", !"none"}
 !123 = !{!"kernel_arg_access_qual", !"write_only", !"write_only", !"none"}
@@ -373,29 +365,21 @@ attributes #0 = { readnone }
 !143 = !{!"kernel_arg_base_type", !"image3d_t", !"image3d_t", !"int*"}
 !152 = !{!"kernel_arg_type_qual", !"", !"", !""}
 
-!12 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*,
-              %opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_rd_3_0,
+!12 = !{ptr @test_2d_rd_3_0,
               !114, !124, !134, !144, !154}
-!13 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*,
-              %opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_rd_3_0,
+!13 = !{ptr @test_3d_rd_3_0,
               !114, !124, !135, !145, !154}
-!14 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*,
-              %opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_wr_3_0,
+!14 = !{ptr @test_2d_wr_3_0,
               !114, !125, !134, !144, !154}
-!15 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*,
-              %opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_wr_3_0,
+!15 = !{ptr @test_3d_wr_3_0,
               !114, !125, !135, !145, !154}
-!16 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*,
-              %opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_mix_3_0,
+!16 = !{ptr @test_2d_mix_3_0,
               !114, !126, !134, !144, !154}
-!17 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*,
-              %opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_mix_3_0,
+!17 = !{ptr @test_3d_mix_3_0,
               !114, !126, !135, !145, !154}
-!18 = !{void (%opencl.image2d_t addrspace(1)*, %opencl.image3d_t addrspace(1)*,
-              %opencl.image2d_t addrspace(1)*, i32 addrspace(1)*)* @test_2d_mix_3_1,
+!18 = !{ptr @test_2d_mix_3_1,
               !114, !127, !134, !144, !154}
-!19 = !{void (%opencl.image3d_t addrspace(1)*, %opencl.image2d_t addrspace(1)*,
-              %opencl.image3d_t addrspace(1)*, i32 addrspace(1)*)* @test_3d_mix_3_1,
+!19 = !{ptr @test_3d_mix_3_1,
               !114, !127, !135, !145, !154}
 !114 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1}
 !124 = !{!"kernel_arg_access_qual", !"read_only", !"read_only", !"read_only", !"none"}

diff  --git a/llvm/test/CodeGen/AMDGPU/image-schedule.ll b/llvm/test/CodeGen/AMDGPU/image-schedule.ll
index 8c875ae90d064..dbd9efc58e59d 100644
--- a/llvm/test/CodeGen/AMDGPU/image-schedule.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-schedule.ll
@@ -17,22 +17,20 @@ define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg, i32 inreg %arg1
   %.0.vec.insert = insertelement <2 x i32> undef, i32 %arg2, i32 0
   %.4.vec.insert = shufflevector <2 x i32> %.0.vec.insert, <2 x i32> %tmp6, <2 x i32> <i32 0, i32 3>
   %tmp7 = bitcast <2 x i32> %.4.vec.insert to i64
-  %tmp8 = inttoptr i64 %tmp7 to [4294967295 x i8] addrspace(4)*
+  %tmp8 = inttoptr i64 %tmp7 to ptr addrspace(4)
   %tmp9 = add <3 x i32> %arg3, %arg5
-  %tmp10 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %tmp8, i64 0, i64 32
-  %tmp11 = bitcast i8 addrspace(4)* %tmp10 to <8 x i32> addrspace(4)*, !amdgpu.uniform !0
-  %tmp12 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp11, align 16
+  %tmp10 = getelementptr [4294967295 x i8], ptr addrspace(4) %tmp8, i64 0, i64 32
+  %tmp12 = load <8 x i32>, ptr addrspace(4) %tmp10, align 16
   %tmp13.0 = extractelement <3 x i32> %tmp9, i32 0
   %tmp13.1 = extractelement <3 x i32> %tmp9, i32 1
   %tmp14 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %tmp13.0, i32 %tmp13.1, <8 x i32> %tmp12, i32 0, i32 0) #0
-  %tmp15 = inttoptr i64 %tmp7 to <8 x i32> addrspace(4)*
-  %tmp16 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp15, align 16
+  %tmp15 = inttoptr i64 %tmp7 to ptr addrspace(4)
+  %tmp16 = load <8 x i32>, ptr addrspace(4) %tmp15, align 16
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %tmp14, i32 15, i32 %tmp13.0, i32 %tmp13.1, <8 x i32> %tmp16, i32 0, i32 0) #0
-  %tmp17 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp15, align 16
+  %tmp17 = load <8 x i32>, ptr addrspace(4) %tmp15, align 16
   %tmp18 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 165, i32 %tmp13.0, i32 %tmp13.1, <8 x i32> %tmp17, i32 0, i32 0) #0
-  %tmp19 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %tmp8, i64 0, i64 64
-  %tmp20 = bitcast i8 addrspace(4)* %tmp19 to <8 x i32> addrspace(4)*, !amdgpu.uniform !0
-  %tmp21 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp20, align 16
+  %tmp19 = getelementptr [4294967295 x i8], ptr addrspace(4) %tmp8, i64 0, i64 64
+  %tmp21 = load <8 x i32>, ptr addrspace(4) %tmp19, align 16
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %tmp18, i32 15, i32 %tmp13.0, i32 %tmp13.1, <8 x i32> %tmp21, i32 0, i32 0) #0
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll
index e4bf5f1149326..674c1e597423d 100644
--- a/llvm/test/CodeGen/AMDGPU/imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm.ll
@@ -3,7 +3,7 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 ; Use a 64-bit value with lo bits that can be represented as an inline constant
-define amdgpu_kernel void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
 ; SI-LABEL: i64_imm_inline_lo:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -26,12 +26,12 @@ define amdgpu_kernel void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 entry:
-  store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005
+  store i64 1311768464867721221, ptr addrspace(1) %out ; 0x1234567800000005
   ret void
 }
 
 ; Use a 64-bit value with hi bits that can be represented as an inline constant
-define amdgpu_kernel void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) {
 ; SI-LABEL: i64_imm_inline_hi:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -54,11 +54,11 @@ define amdgpu_kernel void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 entry:
-  store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678
+  store i64 21780256376, ptr addrspace(1) %out ; 0x0000000512345678
   ret void
 }
 
-define amdgpu_kernel void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_imm_neg_0.0_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -80,11 +80,11 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store i64 -9223372036854775808, i64 addrspace(1) *%out
+  store i64 -9223372036854775808, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_neg_0.0_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -104,11 +104,11 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store i32 -2147483648, i32 addrspace(1)* %out
+  store i32 -2147483648, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_0.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -128,11 +128,11 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float 0.0, float addrspace(1)* %out
+  store float 0.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_imm_neg_0.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -152,11 +152,11 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float -0.0, float addrspace(1)* %out
+  store float -0.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_0.5_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -176,11 +176,11 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float 0.5, float addrspace(1)* %out
+  store float 0.5, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_m_0.5_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -200,11 +200,11 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float -0.5, float addrspace(1)* %out
+  store float -0.5, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_1.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -224,11 +224,11 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float 1.0, float addrspace(1)* %out
+  store float 1.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_m_1.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -248,11 +248,11 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float -1.0, float addrspace(1)* %out
+  store float -1.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_2.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -272,11 +272,11 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float 2.0, float addrspace(1)* %out
+  store float 2.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_m_2.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -296,11 +296,11 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float -2.0, float addrspace(1)* %out
+  store float -2.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_4.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -320,11 +320,11 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float 4.0, float addrspace(1)* %out
+  store float 4.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_m_4.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -344,11 +344,11 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float -4.0, float addrspace(1)* %out
+  store float -4.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_inv_2pi_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -368,11 +368,11 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float 0x3FC45F3060000000, float addrspace(1)* %out
+  store float 0x3FC45F3060000000, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_m_inv_2pi_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -392,11 +392,11 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %o
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float 0xBFC45F3060000000, float addrspace(1)* %out
+  store float 0xBFC45F3060000000, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_literal_imm_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) {
 ; SI-LABEL: store_literal_imm_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -416,11 +416,11 @@ define amdgpu_kernel void @store_literal_imm_f32(float addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store float 4096.0, float addrspace(1)* %out
+  store float 4096.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_0.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -443,11 +443,11 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(float addrspace(1)* %out, floa
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, 0.0
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_0.5_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -470,11 +470,11 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(float addrspace(1)* %out, floa
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, 0.5
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_neg_0.5_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -497,11 +497,11 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out,
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, -0.5
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_1.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -524,11 +524,11 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(float addrspace(1)* %out, floa
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, 1.0
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_neg_1.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -551,11 +551,11 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out,
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, -1.0
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_2.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -578,11 +578,11 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(float addrspace(1)* %out, floa
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, 2.0
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_neg_2.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -605,11 +605,11 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out,
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, -2.0
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_4.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -632,11 +632,11 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(float addrspace(1)* %out, floa
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, 4.0
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_neg_4.0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -659,11 +659,11 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out,
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, -4.0
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: commute_add_inline_imm_0.5_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -699,13 +699,13 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %o
 ; VI-NEXT:    v_add_f32_e32 v0, 0.5, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
-  %x = load float, float addrspace(1)* %in
+  %x = load float, ptr addrspace(1) %in
   %y = fadd float %x, 0.5
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: commute_add_literal_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -741,13 +741,13 @@ define amdgpu_kernel void @commute_add_literal_f32(float addrspace(1)* %out, flo
 ; VI-NEXT:    v_add_f32_e32 v0, 0x44800000, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
-  %x = load float, float addrspace(1)* %in
+  %x = load float, ptr addrspace(1) %in
   %y = fadd float %x, 1024.0
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_1_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -770,11 +770,11 @@ define amdgpu_kernel void @add_inline_imm_1_f32(float addrspace(1)* %out, float
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, 0x36a0000000000000
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_2_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -797,11 +797,11 @@ define amdgpu_kernel void @add_inline_imm_2_f32(float addrspace(1)* %out, float
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, 0x36b0000000000000
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_16_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -824,11 +824,11 @@ define amdgpu_kernel void @add_inline_imm_16_f32(float addrspace(1)* %out, float
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, 0x36e0000000000000
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_neg_1_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -855,11 +855,11 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, fl
   %xbc = bitcast float %x to i32
   %y = add i32 %xbc, -1
   %ybc = bitcast i32 %y to float
-  store float %ybc, float addrspace(1)* %out
+  store float %ybc, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_neg_2_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -886,11 +886,11 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, fl
   %xbc = bitcast float %x to i32
   %y = add i32 %xbc, -2
   %ybc = bitcast i32 %y to float
-  store float %ybc, float addrspace(1)* %out
+  store float %ybc, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_neg_16_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -917,11 +917,11 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, f
   %xbc = bitcast float %x to i32
   %y = add i32 %xbc, -16
   %ybc = bitcast i32 %y to float
-  store float %ybc, float addrspace(1)* %out
+  store float %ybc, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_63_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -944,11 +944,11 @@ define amdgpu_kernel void @add_inline_imm_63_f32(float addrspace(1)* %out, float
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, 0x36ff800000000000
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x) {
 ; SI-LABEL: add_inline_imm_64_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -971,11 +971,11 @@ define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd float %x, 0x3700000000000000
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_0.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -998,11 +998,11 @@ define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, [8
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 0.0
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_0.5_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1025,11 +1025,11 @@ define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, [8
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 0.5
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_neg_0.5_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1052,11 +1052,11 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out,
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, -0.5
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_1.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1079,11 +1079,11 @@ define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, [8
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 1.0
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_neg_1.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1106,11 +1106,11 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out,
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, -1.0
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_2.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1133,11 +1133,11 @@ define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, [8
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 2.0
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_neg_2.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1160,11 +1160,11 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out,
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, -2.0
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_4.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1187,11 +1187,11 @@ define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, [8
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 4.0
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_neg_4.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1214,11 +1214,11 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out,
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, -4.0
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_inv_2pi_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1243,11 +1243,11 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out,
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 0x3fc45f306dc9c882
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_m_inv_2pi_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1274,11 +1274,11 @@ define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, [8 x i32
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 0xbfc45f306dc9c882
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_1_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1301,11 +1301,11 @@ define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, [8 x
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 0x0000000000000001
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_2_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1328,11 +1328,11 @@ define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, [8 x
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 0x0000000000000002
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_16_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1355,11 +1355,11 @@ define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, [8 x
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 0x0000000000000010
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_neg_1_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1382,11 +1382,11 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, [
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 0xffffffffffffffff
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_neg_2_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1409,11 +1409,11 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, [
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 0xfffffffffffffffe
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_neg_16_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1436,11 +1436,11 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out,
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 0xfffffffffffffff0
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_63_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1463,11 +1463,11 @@ define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, [8 x
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 0x000000000000003F
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, [8 x i32], double %x) {
+define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32], double %x) {
 ; SI-LABEL: add_inline_imm_64_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
@@ -1490,11 +1490,11 @@ define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, [8 x
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %y = fadd double %x, 0x0000000000000040
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_0.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1516,11 +1516,11 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double 0.0, double addrspace(1)* %out
+  store double 0.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_literal_imm_neg_0.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1542,11 +1542,11 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %o
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double -0.0, double addrspace(1)* %out
+  store double -0.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_0.5_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1568,11 +1568,11 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double 0.5, double addrspace(1)* %out
+  store double 0.5, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_m_0.5_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1594,11 +1594,11 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double -0.5, double addrspace(1)* %out
+  store double -0.5, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_1.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1620,11 +1620,11 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double 1.0, double addrspace(1)* %out
+  store double 1.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_m_1.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1646,11 +1646,11 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double -1.0, double addrspace(1)* %out
+  store double -1.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_2.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1672,11 +1672,11 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double 2.0, double addrspace(1)* %out
+  store double 2.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_m_2.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1698,11 +1698,11 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double -2.0, double addrspace(1)* %out
+  store double -2.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_4.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1724,11 +1724,11 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double 4.0, double addrspace(1)* %out
+  store double 4.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_m_4.0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1750,11 +1750,11 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double -4.0, double addrspace(1)* %out
+  store double -4.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inv_2pi_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inv_2pi_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1776,11 +1776,11 @@ define amdgpu_kernel void @store_inv_2pi_f64(double addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double 0x3fc45f306dc9c882, double addrspace(1)* %out
+  store double 0x3fc45f306dc9c882, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_inline_imm_m_inv_2pi_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1802,11 +1802,11 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double 0xbfc45f306dc9c882, double addrspace(1)* %out
+  store double 0xbfc45f306dc9c882, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_literal_imm_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) {
 ; SI-LABEL: store_literal_imm_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -1828,7 +1828,7 @@ define amdgpu_kernel void @store_literal_imm_f64(double addrspace(1)* %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-  store double 4096.0, double addrspace(1)* %out
+  store double 4096.0, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll
index 17aff8b243f37..8cf9dc1250887 100644
--- a/llvm/test/CodeGen/AMDGPU/imm16.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm16.ll
@@ -6,7 +6,7 @@
 
 ; FIXME: Merge into imm.ll
 
-define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_inline_imm_neg_0.0_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -51,11 +51,11 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
-  store volatile i16 -32768, i16 addrspace(1)* %out
+  store volatile i16 -32768, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_inline_imm_0.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -96,11 +96,11 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half 0.0, half addrspace(1)* %out
+  store half 0.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_imm_neg_0.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -141,11 +141,11 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half -0.0, half addrspace(1)* %out
+  store half -0.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_inline_imm_0.5_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -186,11 +186,11 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half 0.5, half addrspace(1)* %out
+  store half 0.5, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_inline_imm_m_0.5_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -231,11 +231,11 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half -0.5, half addrspace(1)* %out
+  store half -0.5, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_inline_imm_1.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -276,11 +276,11 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half 1.0, half addrspace(1)* %out
+  store half 1.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_inline_imm_m_1.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -321,11 +321,11 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half -1.0, half addrspace(1)* %out
+  store half -1.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_inline_imm_2.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -366,11 +366,11 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half 2.0, half addrspace(1)* %out
+  store half 2.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_inline_imm_m_2.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -411,11 +411,11 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half -2.0, half addrspace(1)* %out
+  store half -2.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_inline_imm_4.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -456,11 +456,11 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half 4.0, half addrspace(1)* %out
+  store half 4.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_inline_imm_m_4.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -501,11 +501,11 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half -4.0, half addrspace(1)* %out
+  store half -4.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_inline_imm_inv_2pi_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -546,11 +546,11 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out)
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half 0xH3118, half addrspace(1)* %out
+  store half 0xH3118, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_inline_imm_m_inv_2pi_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -591,11 +591,11 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %ou
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half 0xHB118, half addrspace(1)* %out
+  store half 0xHB118, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_literal_imm_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) {
 ; GFX10-LABEL: store_literal_imm_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
@@ -636,11 +636,11 @@ define amdgpu_kernel void @store_literal_imm_f16(half addrspace(1)* %out) {
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-  store half 4096.0, half addrspace(1)* %out
+  store half 4096.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_0.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -690,11 +690,11 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, 0.0
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_0.5_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -744,11 +744,11 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, 0.5
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_neg_0.5_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -798,11 +798,11 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, h
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, -0.5
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_1.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -852,11 +852,11 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, 1.0
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_neg_1.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -906,11 +906,11 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, h
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, -1.0
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_2.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -960,11 +960,11 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, 2.0
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_neg_2.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -1014,11 +1014,11 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, h
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, -2.0
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_4.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -1068,11 +1068,11 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, 4.0
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_neg_4.0_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -1122,11 +1122,11 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, h
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, -4.0
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX10-LABEL: commute_add_inline_imm_0.5_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa]
@@ -1201,13 +1201,13 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %ou
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
-  %x = load half, half addrspace(1)* %in
+  %x = load half, ptr addrspace(1) %in
   %y = fadd half %x, 0.5
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX10-LABEL: commute_add_literal_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa]
@@ -1282,13 +1282,13 @@ define amdgpu_kernel void @commute_add_literal_f16(half addrspace(1)* %out, half
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
-  %x = load half, half addrspace(1)* %in
+  %x = load half, ptr addrspace(1) %in
   %y = fadd half %x, 1024.0
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_1_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -1338,11 +1338,11 @@ define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, 0xH0001
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_2_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -1392,11 +1392,11 @@ define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, 0xH0002
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_16_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -1446,11 +1446,11 @@ define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, 0xH0010
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX10-LABEL: add_inline_imm_neg_1_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa]
@@ -1523,14 +1523,14 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, i16
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
-  %x = load i16, i16 addrspace(1)* %in
+  %x = load i16, ptr addrspace(1) %in
   %y = add i16 %x, -1
   %ybc = bitcast i16 %y to half
-  store half %ybc, half addrspace(1)* %out
+  store half %ybc, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX10-LABEL: add_inline_imm_neg_2_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa]
@@ -1603,14 +1603,14 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, i16
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, -2, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
-  %x = load i16, i16 addrspace(1)* %in
+  %x = load i16, ptr addrspace(1) %in
   %y = add i16 %x, -2
   %ybc = bitcast i16 %y to half
-  store half %ybc, half addrspace(1)* %out
+  store half %ybc, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX10-LABEL: add_inline_imm_neg_16_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa]
@@ -1683,14 +1683,14 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, i1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
-  %x = load i16, i16 addrspace(1)* %in
+  %x = load i16, ptr addrspace(1) %in
   %y = add i16 %x, -16
   %ybc = bitcast i16 %y to half
-  store half %ybc, half addrspace(1)* %out
+  store half %ybc, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_63_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -1740,11 +1740,11 @@ define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, 0xH003F
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10-LABEL: add_inline_imm_64_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
@@ -1794,13 +1794,13 @@ define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
   %y = fadd half %x, 0xH0040
-  store half %y, half addrspace(1)* %out
+  store half %y, ptr addrspace(1) %out
   ret void
 }
 
 ; This needs to be emitted as a literal constant since the 16-bit
 ; float values do not work for 16-bit integer operations.
-define void @mul_inline_imm_0.5_i16(i16 addrspace(1)* %out, i16 %x) {
+define void @mul_inline_imm_0.5_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_0.5_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
@@ -1840,11 +1840,11 @@ define void @mul_inline_imm_0.5_i16(i16 addrspace(1)* %out, i16 %x) {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half 0.5 to i16)
-  store i16 %y, i16 addrspace(1)* %out
+  store i16 %y, ptr addrspace(1) %out
   ret void
 }
 
-define void @mul_inline_imm_neg_0.5_i16(i16 addrspace(1)* %out, i16 %x) {
+define void @mul_inline_imm_neg_0.5_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_neg_0.5_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
@@ -1884,11 +1884,11 @@ define void @mul_inline_imm_neg_0.5_i16(i16 addrspace(1)* %out, i16 %x) {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half -0.5 to i16)
-  store i16 %y, i16 addrspace(1)* %out
+  store i16 %y, ptr addrspace(1) %out
   ret void
 }
 
-define void @mul_inline_imm_1.0_i16(i16 addrspace(1)* %out, i16 %x) {
+define void @mul_inline_imm_1.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_1.0_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
@@ -1928,11 +1928,11 @@ define void @mul_inline_imm_1.0_i16(i16 addrspace(1)* %out, i16 %x) {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half 1.0 to i16)
-  store i16 %y, i16 addrspace(1)* %out
+  store i16 %y, ptr addrspace(1) %out
   ret void
 }
 
-define void @mul_inline_imm_neg_1.0_i16(i16 addrspace(1)* %out, i16 %x) {
+define void @mul_inline_imm_neg_1.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_neg_1.0_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
@@ -1972,11 +1972,11 @@ define void @mul_inline_imm_neg_1.0_i16(i16 addrspace(1)* %out, i16 %x) {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half -1.0 to i16)
-  store i16 %y, i16 addrspace(1)* %out
+  store i16 %y, ptr addrspace(1) %out
   ret void
 }
 
-define void @shl_inline_imm_2.0_i16(i16 addrspace(1)* %out, i16 %x) {
+define void @shl_inline_imm_2.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: shl_inline_imm_2.0_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
@@ -2016,11 +2016,11 @@ define void @shl_inline_imm_2.0_i16(i16 addrspace(1)* %out, i16 %x) {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = shl i16 bitcast (half 2.0 to i16), %x
-  store i16 %y, i16 addrspace(1)* %out
+  store i16 %y, ptr addrspace(1) %out
   ret void
 }
 
-define void @shl_inline_imm_neg_2.0_i16(i16 addrspace(1)* %out, i16 %x) {
+define void @shl_inline_imm_neg_2.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: shl_inline_imm_neg_2.0_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
@@ -2060,11 +2060,11 @@ define void @shl_inline_imm_neg_2.0_i16(i16 addrspace(1)* %out, i16 %x) {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = shl i16 bitcast (half -2.0 to i16), %x
-  store i16 %y, i16 addrspace(1)* %out
+  store i16 %y, ptr addrspace(1) %out
   ret void
 }
 
-define void @mul_inline_imm_4.0_i16(i16 addrspace(1)* %out, i16 %x) {
+define void @mul_inline_imm_4.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_4.0_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
@@ -2104,11 +2104,11 @@ define void @mul_inline_imm_4.0_i16(i16 addrspace(1)* %out, i16 %x) {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half 4.0 to i16)
-  store i16 %y, i16 addrspace(1)* %out
+  store i16 %y, ptr addrspace(1) %out
   ret void
 }
 
-define void @mul_inline_imm_neg_4.0_i16(i16 addrspace(1)* %out, i16 %x) {
+define void @mul_inline_imm_neg_4.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_neg_4.0_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
@@ -2148,11 +2148,11 @@ define void @mul_inline_imm_neg_4.0_i16(i16 addrspace(1)* %out, i16 %x) {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half -4.0 to i16)
-  store i16 %y, i16 addrspace(1)* %out
+  store i16 %y, ptr addrspace(1) %out
   ret void
 }
 
-define void @mul_inline_imm_inv2pi_i16(i16 addrspace(1)* %out, i16 %x) {
+define void @mul_inline_imm_inv2pi_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_inv2pi_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
@@ -2192,6 +2192,6 @@ define void @mul_inline_imm_inv2pi_i16(i16 addrspace(1)* %out, i16 %x) {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half 0xH3118 to i16)
-  store i16 %y, i16 addrspace(1)* %out
+  store i16 %y, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll
index 493b2c4b6611b..8c33004a2316e 100644
--- a/llvm/test/CodeGen/AMDGPU/immv216.ll
+++ b/llvm/test/CodeGen/AMDGPU/immv216.ll
@@ -8,112 +8,112 @@
 ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
-  store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out
+define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(ptr addrspace(1) %out) #0 {
+  store <2 x i16> <i16 -32768, i16 -32768>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_inline_imm_0.0_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half 0.0, half 0.0>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_imm_neg_0.0_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half -0.0, half -0.0>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_inline_imm_0.5_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half 0.5, half 0.5>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half -0.5, half -0.5>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_inline_imm_1.0_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half 1.0, half 1.0>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half -1.0, half -1.0>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_inline_imm_2.0_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half 2.0, half 2.0>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half -2.0, half -2.0>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_inline_imm_4.0_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half 4.0, half 4.0>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half -4.0, half -4.0>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half 0xH3118, half 0xH3118>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118 ; encoding
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half 0xHB118, half 0xHB118>, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_literal_imm_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00
 ; GCN: buffer_store_{{dword|b32}} [[REG]]
-define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
-  store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @store_literal_imm_v2f16(ptr addrspace(1) %out) #0 {
+  store <2 x half> <half 4096.0, half 4096.0>, ptr addrspace(1) %out
   ret void
 }
 
@@ -132,9 +132,9 @@ define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_0.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0.0, half 0.0>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -157,9 +157,9 @@ define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %ou
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_0.5_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0.5, half 0.5>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -182,9 +182,9 @@ define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %ou
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half -0.5, half -0.5>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -207,9 +207,9 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)*
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_1.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 1.0, half 1.0>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -233,9 +233,9 @@ define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %ou
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half -1.0, half -1.0>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -258,9 +258,9 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)*
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_2.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 2.0, half 2.0>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -283,9 +283,9 @@ define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %ou
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half -2.0, half -2.0>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -308,9 +308,9 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)*
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_4.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 4.0, half 4.0>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -333,9 +333,9 @@ define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %ou
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half -4.0, half -4.0>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -355,10 +355,10 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)*
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
-  %x = load <2 x half>, <2 x half> addrspace(1)* %in
+define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %x = load <2 x half>, ptr addrspace(1) %in
   %y = fadd <2 x half> %x, <half 0.5, half 0.5>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -378,10 +378,10 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace
 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: buffer_store_dword
-define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
-  %x = load <2 x half>, <2 x half> addrspace(1)* %in
+define amdgpu_kernel void @commute_add_literal_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %x = load <2 x half>, ptr addrspace(1) %in
   %y = fadd <2 x half> %x, <half 1024.0, half 1024.0>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -404,9 +404,9 @@ define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %o
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1 ; encoding
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_1_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -430,9 +430,9 @@ define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out,
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2 ; encoding
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_2_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -456,9 +456,9 @@ define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out,
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16 ; encoding
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_16_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -475,11 +475,11 @@ define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out
 ; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1 ; encoding
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
 ; VI: buffer_store_dword [[REG]]
-define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %xbc = bitcast <2 x half> %x to i32
   %y = add i32 %xbc, -1
   %ybc = bitcast i32 %y to <2 x half>
-  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
+  store <2 x half> %ybc, ptr addrspace(1) %out
   ret void
 }
 
@@ -496,11 +496,11 @@ define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %
 ; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe ; encoding
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
 ; VI: buffer_store_dword [[REG]]
-define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %xbc = bitcast <2 x half> %x to i32
   %y = add i32 %xbc, 4294901758 ; 0xfffefffe
   %ybc = bitcast i32 %y to <2 x half>
-  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
+  store <2 x half> %ybc, ptr addrspace(1) %out
   ret void
 }
 
@@ -518,11 +518,11 @@ define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %
 ; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0 ; encoding
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
 ; VI: buffer_store_dword [[REG]]
-define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %xbc = bitcast <2 x half> %x to i32
   %y = add i32 %xbc, 4293984240 ; 0xfff0fff0
   %ybc = bitcast i32 %y to <2 x half>
-  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
+  store <2 x half> %ybc, ptr addrspace(1) %out
   ret void
 }
 
@@ -545,9 +545,9 @@ define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)*
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_63_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -570,9 +570,9 @@ define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out
 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64
 ; VI: v_or_b32
 ; VI: buffer_store_dword
-define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+define amdgpu_kernel void @add_inline_imm_64_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
   %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
-  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  store <2 x half> %y, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
index 0221933acbb5d..060aad291165a 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
@@ -2,216 +2,195 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=5 -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_local_size_x(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @get_local_size_x(ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: @get_local_size_x(
-; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-; GCN-NEXT:    [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 12
-; GCN-NEXT:    [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)*
-; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 4
-; GCN-NEXT:    store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12
+; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, ptr addrspace(4) [[GEP_LOCAL_SIZE]], align 4
+; GCN-NEXT:    store i16 [[LOCAL_SIZE]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
-  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %bc.block.count.x = bitcast i8 addrspace(4)* %implicitarg.ptr  to i32 addrspace(4)*
-  %block.count.x = load i32, i32 addrspace(4)* %bc.block.count.x, align 4
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %block.count.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
   %cmp.id.count = icmp ult i32 %group.id, %block.count.x
   %local.size.offset = select i1 %cmp.id.count, i64 12, i64 18
-  %gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
-  %bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
-  %local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2
-  store i16 %local.size, i16 addrspace(1)* %out
+  %gep.local.size = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 %local.size.offset
+  %local.size = load i16, ptr addrspace(4) %gep.local.size, align 2
+  store i16 %local.size, ptr addrspace(1) %out
   ret void
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_local_size_y(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @get_local_size_y(ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: @get_local_size_y(
-; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-; GCN-NEXT:    [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 14
-; GCN-NEXT:    [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)*
-; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 2
-; GCN-NEXT:    store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14
+; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, ptr addrspace(4) [[GEP_LOCAL_SIZE]], align 2
+; GCN-NEXT:    store i16 [[LOCAL_SIZE]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
-  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.block.count.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 4
-  %bc.block.count.y = bitcast i8 addrspace(4)* %gep.block.count.y  to i32 addrspace(4)*
-  %block.count.y = load i32, i32 addrspace(4)* %bc.block.count.y, align 4
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.block.count.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 4
+  %block.count.y = load i32, ptr addrspace(4) %gep.block.count.y, align 4
   %cmp.id.count = icmp ult i32 %group.id, %block.count.y
   %local.size.offset = select i1 %cmp.id.count, i64 14, i64 20
-  %gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
-  %bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
-  %local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2
-  store i16 %local.size, i16 addrspace(1)* %out
+  %gep.local.size = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 %local.size.offset
+  %local.size = load i16, ptr addrspace(4) %gep.local.size, align 2
+  store i16 %local.size, ptr addrspace(1) %out
   ret void
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_local_size_z(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @get_local_size_z(ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: @get_local_size_z(
-; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-; GCN-NEXT:    [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 16
-; GCN-NEXT:    [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)*
-; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 4
-; GCN-NEXT:    store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16
+; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, ptr addrspace(4) [[GEP_LOCAL_SIZE]], align 4
+; GCN-NEXT:    store i16 [[LOCAL_SIZE]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.z()
-  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.block.count.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 8
-  %bc.block.count.z = bitcast i8 addrspace(4)* %gep.block.count.z  to i32 addrspace(4)*
-  %block.count.z = load i32, i32 addrspace(4)* %bc.block.count.z, align 4
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.block.count.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 8
+  %block.count.z = load i32, ptr addrspace(4) %gep.block.count.z, align 4
   %cmp.id.count = icmp ult i32 %group.id, %block.count.z
   %local.size.offset = select i1 %cmp.id.count, i64 16, i64 22
-  %gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
-  %bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
-  %local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2
-  store i16 %local.size, i16 addrspace(1)* %out
+  %gep.local.size = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 %local.size.offset
+  %local.size = load i16, ptr addrspace(4) %gep.local.size, align 2
+  store i16 %local.size, ptr addrspace(1) %out
   ret void
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_remainder_x(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @get_remainder_x(ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: @get_remainder_x(
-; GCN-NEXT:    store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    store i16 0, ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
-  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
-  %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
-  %remainder.x = load i16, i16 addrspace(4)* %bc.x, align 2
-  store i16 %remainder.x, i16 addrspace(1)* %out
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
+  %remainder.x = load i16, ptr addrspace(4) %gep.x, align 2
+  store i16 %remainder.x, ptr addrspace(1) %out
   ret void
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_remainder_y(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @get_remainder_y(ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: @get_remainder_y(
-; GCN-NEXT:    store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    store i16 0, ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
-  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
-  %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
-  %remainder.y = load i16, i16 addrspace(4)* %bc.y, align 2
-  store i16 %remainder.y, i16 addrspace(1)* %out
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
+  %remainder.y = load i16, ptr addrspace(4) %gep.y, align 2
+  store i16 %remainder.y, ptr addrspace(1) %out
   ret void
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_remainder_z(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @get_remainder_z(ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: @get_remainder_z(
-; GCN-NEXT:    store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    store i16 0, ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
-  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
-  %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
-  %remainder.z = load i16, i16 addrspace(4)* %bc.z, align 2
-  store i16 %remainder.z, i16 addrspace(1)* %out
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 18
+  %remainder.z = load i16, ptr addrspace(4) %gep.z, align 2
+  store i16 %remainder.z, ptr addrspace(1) %out
   ret void
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_work_group_size_x(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @get_work_group_size_x(ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: @get_work_group_size_x(
-; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-; GCN-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 12
-; GCN-NEXT:    [[BC_X:%.*]] = bitcast i8 addrspace(4)* [[GEP_X]] to i16 addrspace(4)*
-; GCN-NEXT:    [[GROUP_SIZE_X:%.*]] = load i16, i16 addrspace(4)* [[BC_X]], align 4
-; GCN-NEXT:    store i16 [[GROUP_SIZE_X]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12
+; GCN-NEXT:    [[GROUP_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 4
+; GCN-NEXT:    store i16 [[GROUP_SIZE_X]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
-  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 12
-  %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %bc.x, align 2
-  store i16 %group.size.x, i16 addrspace(1)* %out
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 12
+  %group.size.x = load i16, ptr addrspace(4) %gep.x, align 2
+  store i16 %group.size.x, ptr addrspace(1) %out
   ret void
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_work_group_size_y(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @get_work_group_size_y(ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: @get_work_group_size_y(
-; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-; GCN-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 14
-; GCN-NEXT:    [[BC_Y:%.*]] = bitcast i8 addrspace(4)* [[GEP_Y]] to i16 addrspace(4)*
-; GCN-NEXT:    [[GROUP_SIZE_Y:%.*]] = load i16, i16 addrspace(4)* [[BC_Y]], align 2
-; GCN-NEXT:    store i16 [[GROUP_SIZE_Y]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 14
+; GCN-NEXT:    [[GROUP_SIZE_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2
+; GCN-NEXT:    store i16 [[GROUP_SIZE_Y]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
-  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 14
-  %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
-  %group.size.y = load i16, i16 addrspace(4)* %bc.y, align 2
-  store i16 %group.size.y, i16 addrspace(1)* %out
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 14
+  %group.size.y = load i16, ptr addrspace(4) %gep.y, align 2
+  store i16 %group.size.y, ptr addrspace(1) %out
   ret void
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_work_group_size_z(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @get_work_group_size_z(ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: @get_work_group_size_z(
-; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-; GCN-NEXT:    [[GEP_Z:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 16
-; GCN-NEXT:    [[BC_Z:%.*]] = bitcast i8 addrspace(4)* [[GEP_Z]] to i16 addrspace(4)*
-; GCN-NEXT:    [[GROUP_SIZE_Z:%.*]] = load i16, i16 addrspace(4)* [[BC_Z]], align 4
-; GCN-NEXT:    store i16 [[GROUP_SIZE_Z]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 16
+; GCN-NEXT:    [[GROUP_SIZE_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 4
+; GCN-NEXT:    store i16 [[GROUP_SIZE_Z]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
-  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 16
-  %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
-  %group.size.z = load i16, i16 addrspace(4)* %bc.z, align 2
-  store i16 %group.size.z, i16 addrspace(1)* %out
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 16
+  %group.size.z = load i16, ptr addrspace(4) %gep.z, align 2
+  store i16 %group.size.z, ptr addrspace(1) %out
   ret void
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_work_group_size_x_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
+define amdgpu_kernel void @get_work_group_size_x_reqd(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
 ; GCN-LABEL: @get_work_group_size_x_reqd(
-; GCN-NEXT:    store i16 8, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    store i16 8, ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
-  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 12
-  %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %bc.x, align 2
-  store i16 %group.size.x, i16 addrspace(1)* %out
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.x = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 12
+  %group.size.x = load i16, ptr addrspace(4) %gep.x, align 2
+  store i16 %group.size.x, ptr addrspace(1) %out
   ret void
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_work_group_size_y_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
+define amdgpu_kernel void @get_work_group_size_y_reqd(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
 ; GCN-LABEL: @get_work_group_size_y_reqd(
-; GCN-NEXT:    store i16 16, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    store i16 16, ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
-  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 14
-  %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
-  %group.size.y = load i16, i16 addrspace(4)* %bc.y, align 2
-  store i16 %group.size.y, i16 addrspace(1)* %out
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 14
+  %group.size.y = load i16, ptr addrspace(4) %gep.y, align 2
+  store i16 %group.size.y, ptr addrspace(1) %out
   ret void
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
-define amdgpu_kernel void @get_work_group_size_z_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
+define amdgpu_kernel void @get_work_group_size_z_reqd(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
 ; GCN-LABEL: @get_work_group_size_z_reqd(
-; GCN-NEXT:    store i16 2, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    store i16 2, ptr addrspace(1) [[OUT:%.*]], align 2
 ; GCN-NEXT:    ret void
 ;
-  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 16
-  %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
-  %group.size.z = load i16, i16 addrspace(4)* %bc.z, align 2
-  store i16 %group.size.z, i16 addrspace(1)* %out
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 16
+  %group.size.z = load i16, ptr addrspace(4) %gep.z, align 2
+  store i16 %group.size.z, ptr addrspace(1) %out
   ret void
 }
 
 
-declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1
+declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
 declare i32 @llvm.amdgcn.workgroup.id.y() #1
 declare i32 @llvm.amdgcn.workgroup.id.z() #1

diff  --git a/llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll b/llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll
index 427df91390d10..5e13fa2ef087a 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll
@@ -7,12 +7,12 @@
 ; CHECK-NOT: COPY [[IMPDEF0]]
 ; CHECK-NOT: COPY [[IMPDEF1]]
 ; CHECK: .false:
-define <2 x float> @vcopy_i1_undef(<2 x float> addrspace(1)* %p) {
+define <2 x float> @vcopy_i1_undef(ptr addrspace(1) %p) {
 entry:
   br i1 undef, label %exit, label %false
 
 false:
-  %x = load <2 x float>, <2 x float> addrspace(1)* %p
+  %x = load <2 x float>, ptr addrspace(1) %p
   %cmp = fcmp one <2 x float> %x, zeroinitializer
   br label %exit
 

diff  --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 85ba5c90e44fe..1e8a44e35fe8f 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -7,7 +7,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefixes=GFX9V4 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefixes=GFX9V5 %s
 
-define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 addrspace(3)* %ptr.local) {
+define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
 ; GFX8V3-LABEL: addrspacecast:
 ; GFX8V3:       ; %bb.0:
 ; GFX8V3-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -148,14 +148,14 @@ define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 add
 ; GFX9V5-NEXT:    flat_store_dword v[2:3], v0
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    s_endpgm
-  %flat.private = addrspacecast i32 addrspace(5)* %ptr.private to i32*
-  %flat.local = addrspacecast i32 addrspace(3)* %ptr.local to i32*
-  store volatile i32 1, i32* %flat.private
-  store volatile i32 2, i32* %flat.local
+  %flat.private = addrspacecast ptr addrspace(5) %ptr.private to ptr
+  %flat.local = addrspacecast ptr addrspace(3) %ptr.local to ptr
+  store volatile i32 1, ptr %flat.private
+  store volatile i32 2, ptr %flat.local
   ret void
 }
 
-define amdgpu_kernel void @llvm_amdgcn_is_shared(i8* %ptr) {
+define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V3-LABEL: llvm_amdgcn_is_shared:
 ; GFX8V3:       ; %bb.0:
 ; GFX8V3-NEXT:    s_load_dword s0, s[4:5], 0x40
@@ -227,13 +227,13 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(i8* %ptr) {
 ; GFX9V5-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    s_endpgm
-  %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %ptr)
+  %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
   %zext = zext i1 %is.shared to i32
-  store volatile i32 %zext, i32 addrspace(1)* undef
+  store volatile i32 %zext, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @llvm_amdgcn_is_private(i8* %ptr) {
+define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V3-LABEL: llvm_amdgcn_is_private:
 ; GFX8V3:       ; %bb.0:
 ; GFX8V3-NEXT:    s_load_dword s0, s[4:5], 0x44
@@ -305,9 +305,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(i8* %ptr) {
 ; GFX9V5-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    s_endpgm
-  %is.private = call i1 @llvm.amdgcn.is.private(i8* %ptr)
+  %is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr)
   %zext = zext i1 %is.private to i32
-  store volatile i32 %zext, i32 addrspace(1)* undef
+  store volatile i32 %zext, ptr addrspace(1) undef
   ret void
 }
 
@@ -372,7 +372,7 @@ define amdgpu_kernel void @llvm_debugtrap() {
   unreachable
 }
 
-define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr)  {
+define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V3-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V3:       ; %bb.0:
 ; GFX8V3-NEXT:    v_mov_b32_e32 v0, s6
@@ -496,22 +496,22 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr)  {
 ; GFX9V5-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    s_endpgm
-  %queue.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
-  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
   %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
-  %queue.load = load volatile i8, i8 addrspace(4)* %queue.ptr
-  %implicitarg.load = load volatile i8, i8 addrspace(4)* %implicitarg.ptr
-  %dispatch.load = load volatile i8, i8 addrspace(4)* %dispatch.ptr
-  store volatile i64 %dispatch.id, i64 addrspace(1)* %ptr
+  %queue.load = load volatile i8, ptr addrspace(4) %queue.ptr
+  %implicitarg.load = load volatile i8, ptr addrspace(4) %implicitarg.ptr
+  %dispatch.load = load volatile i8, ptr addrspace(4) %dispatch.ptr
+  store volatile i64 %dispatch.id, ptr addrspace(1) %ptr
   ret void
 }
 
-declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
-declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+declare noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+declare noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 declare i64 @llvm.amdgcn.dispatch.id()
-declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-declare i1 @llvm.amdgcn.is.shared(i8*)
-declare i1 @llvm.amdgcn.is.private(i8*)
+declare noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+declare i1 @llvm.amdgcn.is.shared(ptr)
+declare i1 @llvm.amdgcn.is.private(ptr)
 declare void @llvm.trap()
 declare void @llvm.debugtrap()

diff  --git a/llvm/test/CodeGen/AMDGPU/implicit-kernel-argument-alignment.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernel-argument-alignment.ll
index aca0a07263442..081b60b51f47e 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernel-argument-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernel-argument-alignment.ll
@@ -4,8 +4,8 @@
 ; CHECK-LABEL: test_unaligned_to_eight:
 ; CHECK: .amdhsa_kernarg_size 264
 define amdgpu_kernel void @test_unaligned_to_eight(i32 %four)  {
-  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  store volatile ptr addrspace(4) %implicitarg.ptr, ptr addrspace(1) undef
   ret void
 }
 
@@ -13,8 +13,8 @@ define amdgpu_kernel void @test_unaligned_to_eight(i32 %four)  {
 ; CHECK-LABEL: test_aligned_to_eight:
 ; CHECK: .amdhsa_kernarg_size 264
 define amdgpu_kernel void @test_aligned_to_eight(i64 %eight)  {
-  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  store volatile ptr addrspace(4) %implicitarg.ptr, ptr addrspace(1) undef
   ret void
 }
 
@@ -55,4 +55,4 @@ define amdgpu_kernel void @test_aligned_to_eight(i64 %eight)  {
 ; CHECK-NEXT:         .kernarg_segment_size: 264
 ; CHECK-LABEL:        .name:           test_aligned_to_eight
 
-declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()

diff  --git a/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll
index f4da140aeb01f..4c5c136f5333f 100644
--- a/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicitarg-attributes.ll
@@ -11,29 +11,28 @@ target triple = "amdgcn-amd-amdhsa"
 ; CHECK: .value_kind:     hidden_multigrid_sync_arg
 ; CHECK-LABEL: .name:           kernel_1
 
-define amdgpu_kernel void @kernel_1(i32 addrspace(1)* %a, i64 %index1, i64 %index2, i1 %cond)  {
+define amdgpu_kernel void @kernel_1(ptr addrspace(1) %a, i64 %index1, i64 %index2, i1 %cond)  {
 entry:
-  %tmp7 = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %tmp7 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   br i1 %cond, label %old, label %new
 
 old:                                              ; preds = %entry
-  %tmp4 = getelementptr i8, i8 addrspace(4)* %tmp7, i64 %index1
+  %tmp4 = getelementptr i8, ptr addrspace(4) %tmp7, i64 %index1
   br label %join
 
 new:                                              ; preds = %entry
-  %tmp12 = getelementptr inbounds i8, i8 addrspace(4)* %tmp7, i64 %index2
+  %tmp12 = getelementptr inbounds i8, ptr addrspace(4) %tmp7, i64 %index2
   br label %join
 
 join:                                             ; preds = %new, %old
-  %.in.in.in = phi i8 addrspace(4)* [ %tmp12, %new ], [ %tmp4, %old ]
-  %.in.in = bitcast i8 addrspace(4)* %.in.in.in to i16 addrspace(4)*
+  %.in.in.in = phi ptr addrspace(4) [ %tmp12, %new ], [ %tmp4, %old ]
 
   ;;; THIS USE is where the offset into implicitarg_ptr is unknown
-  %.in = load i16, i16 addrspace(4)* %.in.in, align 2
+  %.in = load i16, ptr addrspace(4) %.in.in.in, align 2
 
   %idx.ext = sext i16 %.in to i64
-  %add.ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.ext
-  %tmp16 = atomicrmw add i32 addrspace(1)* %add.ptr3, i32 15 syncscope("agent-one-as") monotonic, align 4
+  %add.ptr3 = getelementptr inbounds i32, ptr addrspace(1) %a, i64 %idx.ext
+  %tmp16 = atomicrmw add ptr addrspace(1) %add.ptr3, i32 15 syncscope("agent-one-as") monotonic, align 4
   ret void
 }
 
@@ -47,19 +46,18 @@ join:                                             ; preds = %new, %old
 ; CHECK-NOT: hidden_multigrid_sync_arg
 ; CHECK-LABEL: .name:           kernel_2
 
-define amdgpu_kernel void @kernel_2(i32 addrspace(1)* %a, i1 %cond)  {
+define amdgpu_kernel void @kernel_2(ptr addrspace(1) %a, i1 %cond)  {
 entry:
-  %tmp7 = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %tmp7 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %tmp5 = select i1 %cond, i64 12, i64 18
-  %tmp6 = getelementptr inbounds i8, i8 addrspace(4)* %tmp7, i64 %tmp5
-  %tmp8 = bitcast i8 addrspace(4)* %tmp6 to i16 addrspace(4)*
+  %tmp6 = getelementptr inbounds i8, ptr addrspace(4) %tmp7, i64 %tmp5
 
   ;;; THIS USE is where multiple offsets are possible, relative to implicitarg_ptr
-  %tmp9 = load i16, i16 addrspace(4)* %tmp8, align 2
+  %tmp9 = load i16, ptr addrspace(4) %tmp6, align 2
 
   %idx.ext = sext i16 %tmp9 to i64
-  %add.ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.ext
-  %tmp16 = atomicrmw add i32 addrspace(1)* %add.ptr3, i32 15 syncscope("agent-one-as") monotonic, align 4
+  %add.ptr3 = getelementptr inbounds i32, ptr addrspace(1) %a, i64 %idx.ext
+  %tmp16 = atomicrmw add ptr addrspace(1) %add.ptr3, i32 15 syncscope("agent-one-as") monotonic, align 4
   ret void
 }
 
@@ -67,34 +65,33 @@ entry:
 ; CHECK-NOT: hidden_multigrid_sync_arg
 ; CHECK-LABEL: .name:           kernel_3
 
-define amdgpu_kernel void @kernel_3(i32 addrspace(1)* %a, i1 %cond)  {
+define amdgpu_kernel void @kernel_3(ptr addrspace(1) %a, i1 %cond)  {
 entry:
-  %tmp7 = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %tmp7 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   br i1 %cond, label %old, label %new
 
 old:                                              ; preds = %entry
-  %tmp4 = getelementptr i8, i8 addrspace(4)* %tmp7, i64 12
+  %tmp4 = getelementptr i8, ptr addrspace(4) %tmp7, i64 12
   br label %join
 
 new:                                              ; preds = %entry
-  %tmp12 = getelementptr inbounds i8, i8 addrspace(4)* %tmp7, i64 18
+  %tmp12 = getelementptr inbounds i8, ptr addrspace(4) %tmp7, i64 18
   br label %join
 
 join:                                             ; preds = %new, %old
-  %.in.in.in = phi i8 addrspace(4)* [ %tmp12, %new ], [ %tmp4, %old ]
-  %.in.in = bitcast i8 addrspace(4)* %.in.in.in to i16 addrspace(4)*
+  %.in.in.in = phi ptr addrspace(4) [ %tmp12, %new ], [ %tmp4, %old ]
 
   ;;; THIS USE of implicitarg_ptr should not produce hostcall metadata
-  %.in = load i16, i16 addrspace(4)* %.in.in, align 2
+  %.in = load i16, ptr addrspace(4) %.in.in.in, align 2
 
   %idx.ext = sext i16 %.in to i64
-  %add.ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.ext
-  %tmp16 = atomicrmw add i32 addrspace(1)* %add.ptr3, i32 15 syncscope("agent-one-as") monotonic, align 4
+  %add.ptr3 = getelementptr inbounds i32, ptr addrspace(1) %a, i64 %idx.ext
+  %tmp16 = atomicrmw add ptr addrspace(1) %add.ptr3, i32 15 syncscope("agent-one-as") monotonic, align 4
   ret void
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
-declare align 4 i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 
 declare i32 @llvm.amdgcn.workgroup.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
index 261df086aed7d..ab764179b8841 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
@@ -18,22 +18,22 @@
 ; GCN-COUNT-32: v_cndmask_b32
 
 ; GCN-COUNT-4: buffer_store_dwordx4
-define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 {
+define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) #0 {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %id.ext = zext i32 %id to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
-  %idx0 = load volatile i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
+  %idx0 = load volatile i32, ptr addrspace(1) %gep
   %idx1 = add i32 %idx0, 1
   %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
   %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
   %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
-  store volatile <16 x i32> %vec2, <16 x i32> addrspace(1)* %out0
+  store volatile <16 x i32> %vec2, ptr addrspace(1) %out0
   %cmp = icmp eq i32 %id, 0
   br i1 %cmp, label %bb1, label %bb2
 
 bb1:
-  store volatile i32 %live.out.val, i32 addrspace(1)* undef
+  store volatile i32 %live.out.val, ptr addrspace(1) undef
   br label %bb2
 
 bb2:
@@ -53,15 +53,15 @@ bb2:
 ; GCN-NEXT: v_mov_b32_e32
 ; GCN-NOT: v_mov_b32_e32
 ; GCN-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @insert_w_offset_multiple_in_block(<16 x float> addrspace(1)* %out1, i32 %in) #0 {
+define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) #0 {
 entry:
   %add1 = add i32 %in, 1
   %ins1 = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add1
   %add2 = add i32 %in, 2
   %ins2 = insertelement <16 x float> %ins1, float 17.0, i32 %add2
-  store <16 x float> %ins1, <16 x float> addrspace(1)* %out1
-  %out2 = getelementptr <16 x float>, <16 x float> addrspace(1)* %out1, i32 1
-  store <16 x float> %ins2, <16 x float> addrspace(1)* %out2
+  store <16 x float> %ins1, ptr addrspace(1) %out1
+  %out2 = getelementptr <16 x float>, ptr addrspace(1) %out1, i32 1
+  store <16 x float> %ins2, ptr addrspace(1) %out2
 
   ret void
 }
@@ -75,11 +75,11 @@ declare hidden void @foo()
 ; GCN-NEXT: v_mov_b32_e32 {{v[0-9]+}}, 8
 ; GCN-NEXT: s_set_gpr_idx_off
 ; GCN: s_swappc_b64
-define amdgpu_kernel void @insertelement_with_call(<16 x i32> addrspace(1)* %ptr, i32 %idx) #0 {
-  %vec = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
+define amdgpu_kernel void @insertelement_with_call(ptr addrspace(1) %ptr, i32 %idx) #0 {
+  %vec = load <16 x i32>, ptr addrspace(1) %ptr
   %i6 = insertelement <16 x i32> %vec, i32 8, i32 %idx
   call void @foo()
-  store <16 x i32> %i6, <16 x i32> addrspace(1)* null
+  store <16 x i32> %i6, ptr addrspace(1) null
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
index d7663800fe5bf..42f2714f137be 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
@@ -13,10 +13,10 @@
 ; CHECK: buffer_store_dwordx4
 ; CHECK: buffer_store_dwordx4
 ; CHECK: buffer_store_dwordx4
-define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) {
 entry:
   %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
-  store <16 x float> %ins, <16 x float> addrspace(1)* %out
+  store <16 x float> %ins, ptr addrspace(1) %out
   ret void
 }
 
@@ -45,19 +45,19 @@ bb:
   br i1 %tmp, label %bb1, label %bb4
 
 bb1:
-  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef
   %tmp3 = extractelement <4 x float> %tmp2, i32 undef
   call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) #0 ; Prevent block optimize out
   br label %bb7
 
 bb4:
-  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef
   %tmp6 = extractelement <4 x float> %tmp5, i32 undef
   call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) #0 ; Prevent block optimize out
   br label %bb7
 
 bb7:
   %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
-  store volatile float %tmp8, float addrspace(1)* undef
+  store volatile float %tmp8, ptr addrspace(1) undef
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
index 732079b4d9ff0..264d56e462702 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
@@ -23,22 +23,22 @@
 ; GCN-COUNT-32: v_cndmask_b32
 
 ; GCN-COUNT-4: buffer_store_dwordx4
-define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 {
+define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) #0 {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %id.ext = zext i32 %id to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
-  %idx0 = load volatile i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
+  %idx0 = load volatile i32, ptr addrspace(1) %gep
   %idx1 = add i32 %idx0, 1
   %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
   %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
   %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
-  store volatile <16 x i32> %vec2, <16 x i32> addrspace(1)* %out0
+  store volatile <16 x i32> %vec2, ptr addrspace(1) %out0
   %cmp = icmp eq i32 %id, 0
   br i1 %cmp, label %bb1, label %bb2
 
 bb1:
-  store volatile i32 %live.out.val, i32 addrspace(1)* undef
+  store volatile i32 %live.out.val, ptr addrspace(1) undef
   br label %bb2
 
 bb2:

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 446f81e45415c..9867d45417a1c 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -20,11 +20,11 @@
 ; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
 entry:
   %idx = add i32 %in, 1
   %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %idx
-  store float %elt, float addrspace(1)* %out
+  store float %elt, ptr addrspace(1) %out
   ret void
 }
 
@@ -46,12 +46,12 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(SRC0){{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <16 x i32> %or.val) {
+define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) {
 entry:
   %idx = add i32 %in, 1
   %vec = or <16 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
   %elt = extractelement <16 x i32> %vec, i32 %idx
-  store i32 %elt, i32 addrspace(1)* %out
+  store i32 %elt, ptr addrspace(1) %out
   ret void
 }
 
@@ -68,10 +68,10 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) {
 entry:
   %elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %in
-  store float %elt, float addrspace(1)* %out
+  store float %elt, ptr addrspace(1) %out
   ret void
 }
 
@@ -86,11 +86,11 @@ entry:
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
+define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
   %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -119,12 +119,12 @@ entry:
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) {
+define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
   %or = or <16 x i32> %vec0, %vec1
   %value = extractelement <16 x i32> %or, i32 %index
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -135,32 +135,32 @@ entry:
 ; GCN-COUNT-14: v_cndmask_b32
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 16
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -512
   %value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}extract_undef_offset_sgpr:
 ; undefined behavior, but shouldn't crash compiler
-define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
+  %ld = load volatile <4 x i32>, ptr addrspace(1) %in
   %value = extractelement <4 x i32> %ld, i32 undef
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
 ; undefined behavior, but shouldn't crash compiler
-define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %ld = load <4 x i32>, <4  x i32> addrspace(1)* %in
+  %ld = load <4 x i32>, ptr addrspace(1) %in
   %value = insertelement <4 x i32> %ld, i32 5, i32 undef
-  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -177,11 +177,11 @@ entry:
 
 ; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]]
 ; MOVREL: buffer_store_dwordx4 v[[[ELT0]]:[[ELT3]]]
-define amdgpu_kernel void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
 entry:
   %add = add i32 %in, 1
   %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
-  store <16 x float> %ins, <16 x float> addrspace(1)* %out
+  store <16 x float> %ins, ptr addrspace(1) %out
   ret void
 }
 
@@ -197,12 +197,12 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[BASE]], gpr_idx(DST)
 ; IDXMODE-NEXT: v_mov_b32_e32 [[ELT1]], v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @insert_unsigned_base_plus_offset(<16 x float> addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %out, i16 %in) {
 entry:
   %base = zext i16 %in to i32
   %add = add i32 %base, 1
   %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
-  store <16 x float> %ins, <16 x float> addrspace(1)* %out
+  store <16 x float> %ins, ptr addrspace(1) %out
   ret void
 }
 
@@ -220,12 +220,12 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[BASE_PLUS_OFFSET]], gpr_idx(DST)
 ; IDXMODE-NEXT: v_mov_b32_e32 [[ELT0]], v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @insert_signed_base_plus_offset(<16 x float> addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, i16 %in) {
 entry:
   %base = sext i16 %in to i32
   %add = add i32 %base, 1
   %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
-  store <16 x float> %ins, <16 x float> addrspace(1)* %out
+  store <16 x float> %ins, ptr addrspace(1) %out
   ret void
 }
 
@@ -241,10 +241,10 @@ entry:
 ; IDXMODE-NEXT: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dwordx4 v[[[ELT0]]:
-define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) {
 entry:
   %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
-  store <16 x float> %ins, <16 x float> addrspace(1)* %out
+  store <16 x float> %ins, ptr addrspace(1) %out
   ret void
 }
 
@@ -257,11 +257,11 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 16
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, i32 %offset) {
+define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
   %value = insertelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, i32 16, i32 %index
-  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -277,11 +277,11 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, <16 x i32> %vec, i32 %offset) {
+define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ptr addrspace(1) %out, <16 x i32> %vec, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
   %value = insertelement <16 x i32> %vec, i32 5, i32 %index
-  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -291,12 +291,12 @@ entry:
 ; GCN: v_cmp_eq_u32_e32
 ; GCN-COUNT-16: v_cndmask_b32
 ; GCN-COUNT-4:  buffer_store_dwordx4
-define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -512
   %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 33, i32 %index
-  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -305,12 +305,12 @@ entry:
 ; GCN: v_cmp_eq_u32_e32
 ; GCN-COUNT-16: v_cndmask_b32
 ; GCN-COUNT-4:  buffer_store_dwordx4
-define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -16
   %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 500, i32 %index
-  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -326,23 +326,23 @@ entry:
 
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
-define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %id.ext = zext i32 %id to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
-  %idx0 = load volatile i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
+  %idx0 = load volatile i32, ptr addrspace(1) %gep
   %idx1 = add i32 %idx0, 1
   %val0 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx0
   %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
   %val1 = extractelement <16 x i32> <i32 7, i32 9, i32 11, i32 13, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %idx1
-  store volatile i32 %val0, i32 addrspace(1)* %out0
-  store volatile i32 %val1, i32 addrspace(1)* %out0
+  store volatile i32 %val0, ptr addrspace(1) %out0
+  store volatile i32 %val1, ptr addrspace(1) %out0
   %cmp = icmp eq i32 %id, 0
   br i1 %cmp, label %bb1, label %bb2
 
 bb1:
-  store volatile i32 %live.out.reg, i32 addrspace(1)* undef
+  store volatile i32 %live.out.reg, ptr addrspace(1) undef
   br label %bb2
 
 bb2:
@@ -361,20 +361,20 @@ bb:
   br i1 %tmp, label %bb1, label %bb4
 
 bb1:                                              ; preds = %bb
-  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef
   %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
   call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out
   br label %bb7
 
 bb4:                                              ; preds = %bb
-  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef
   %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
   call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out
   br label %bb7
 
 bb7:                                              ; preds = %bb4, %bb1
   %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
-  store volatile <4 x float> %tmp8, <4 x float> addrspace(1)* undef
+  store volatile <4 x float> %tmp8, ptr addrspace(1) undef
   ret void
 }
 
@@ -414,8 +414,8 @@ bb:
   %tmp6 = extractelement <9 x i32> %tmp5, i32 1
   %tmp7 = bitcast <9 x float> %tmp4 to <9 x i32>
   %tmp8 = extractelement <9 x i32> %tmp7, i32 5
-  store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
-  store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
+  store volatile i32 %tmp6, ptr addrspace(3) undef, align 4
+  store volatile i32 %tmp8, ptr addrspace(3) undef, align 4
   ret void
 }
 
@@ -433,12 +433,12 @@ bb:
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dword [[EXTRACT]]
-define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
 entry:
-  %ld = load volatile <16 x i32>, <16  x i32> addrspace(1)* %in
+  %ld = load volatile <16 x i32>, ptr addrspace(1) %in
   %offset = add i32 %idx, 15
   %value = extractelement <16 x i32> %ld, i32 %offset
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -455,12 +455,12 @@ entry:
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dword [[EXTRACT]]
-define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
 entry:
-  %ld = load volatile <16 x i32>, <16  x i32> addrspace(1)* %in
+  %ld = load volatile <16 x i32>, ptr addrspace(1) %in
   %offset = add i32 %idx, 16
   %value = extractelement <16 x i32> %ld, i32 %offset
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -474,13 +474,13 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(SRC0)
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE: s_set_gpr_idx_off
-define amdgpu_kernel void @extractelement_v16i32_or_index(i32 addrspace(1)* %out, <16 x i32> addrspace(1)* %in, i32 %idx.in) {
+define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx.in) {
 entry:
-  %ld = load volatile <16 x i32>, <16  x i32> addrspace(1)* %in
+  %ld = load volatile <16 x i32>, ptr addrspace(1) %in
   %idx.shl = shl i32 %idx.in, 2
   %idx = or i32 %idx.shl, 1
   %value = extractelement <16 x i32> %ld, i32 %idx
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -494,11 +494,11 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(DST)
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE: s_set_gpr_idx_off
-define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {
+define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, <16 x float> %a, i32 %idx.in) nounwind {
   %idx.shl = shl i32 %idx.in, 2
   %idx = or i32 %idx.shl, 1
   %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx
-  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
+  store <16 x float> %vecins, ptr addrspace(1) %out, align 64
   ret void
 }
 
@@ -533,7 +533,7 @@ bb2:                                              ; preds = %bb4, %bb
   br i1 %tmp3, label %bb4, label %bb8
 
 bb4:                                              ; preds = %bb2
-  %vgpr = load volatile i32, i32 addrspace(1)* undef
+  %vgpr = load volatile i32, ptr addrspace(1) undef
   %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr
   %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr
   %tmp7 = extractelement <16 x i32> %tmp6, i32 0

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index 36053945e1341..fe7323eeadf8a 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -39,9 +39,9 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
 ; CHECK-NEXT:    s_endpgm
 
 bb:
-  %cond = load i1, i1 addrspace(4)* null
-  %tmp = select i1 %cond, void (i8*, i32, i8*)* bitcast (void ()* @wobble to void (i8*, i32, i8*)*), void (i8*, i32, i8*)* bitcast (void ()* @snork to void (i8*, i32, i8*)*)
-  call void %tmp(i8* undef, i32 undef, i8* undef)
+  %cond = load i1, ptr addrspace(4) null
+  %tmp = select i1 %cond, ptr @wobble, ptr @snork
+  call void %tmp(ptr undef, i32 undef, ptr undef)
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 38383bc5f8531..cc8d85c85b0b6 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=GISEL %s
 
-@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant void()*, align 4
-@gv.fptr1 = external hidden unnamed_addr addrspace(4) constant void(i32)*, align 4
+@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
+@gv.fptr1 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
 
 define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GCN-LABEL: test_indirect_call_sgpr_ptr:
@@ -191,7 +191,7 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GISEL-NEXT:    s_endpgm
-  %fptr = load void()*, void()* addrspace(4)* @gv.fptr0
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
   call void %fptr()
   ret void
 }
@@ -384,12 +384,12 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; GISEL-NEXT:    s_endpgm
-  %fptr = load void(i32)*, void(i32)* addrspace(4)* @gv.fptr1
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr1
   call void %fptr(i32 123)
   ret void
 }
 
-define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
+define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -567,7 +567,7 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
   ret void
 }
 
-define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
+define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -749,7 +749,7 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
   ret void
 }
 
-define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
+define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_ret:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -932,7 +932,7 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
   ret i32 %b
 }
 
-define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
+define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_in_branch:
 ; GCN:       ; %bb.0: ; %bb0
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1139,7 +1139,7 @@ bb2:
   ret void
 }
 
-define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
+define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1333,7 +1333,7 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
   ret void
 }
 
-define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr) {
+define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1539,7 +1539,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
 ; TODO The argument and return variable could be in the same physical register, but the register
 ; allocator is not able to do that because the return value clashes with the liverange of an
 ; IMPLICIT_DEF of the argument.
-define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr) {
+define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1738,7 +1738,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr)
 }
 
 ; Calling a vgpr can never be a tail call.
-define void @test_indirect_tail_call_vgpr_ptr(void()* %fptr) {
+define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GCN-LABEL: test_indirect_tail_call_vgpr_ptr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
index 06641c70c2e11..591f582e4a34c 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -20,14 +20,14 @@ declare void @llvm.amdgcn.s.barrier() #0
 ; SI-PROMOTE: ds_read_b64
 ; CI-PROMOTE: ds_write_b64
 ; CI-PROMOTE: ds_read_b64
-define amdgpu_kernel void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
-  %val = load double, double addrspace(1)* %in, align 8
+define amdgpu_kernel void @private_access_f64_alloca(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %b) #1 {
+  %val = load double, ptr addrspace(1) %in, align 8
   %array = alloca [8 x double], align 8, addrspace(5)
-  %ptr = getelementptr inbounds [8 x double], [8 x double] addrspace(5)* %array, i32 0, i32 %b
-  store double %val, double addrspace(5)* %ptr, align 8
+  %ptr = getelementptr inbounds [8 x double], ptr addrspace(5) %array, i32 0, i32 %b
+  store double %val, ptr addrspace(5) %ptr, align 8
   call void @llvm.amdgcn.s.barrier()
-  %result = load double, double addrspace(5)* %ptr, align 8
-  store double %result, double addrspace(1)* %out, align 8
+  %result = load double, ptr addrspace(5) %ptr, align 8
+  store double %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -51,14 +51,14 @@ define amdgpu_kernel void @private_access_f64_alloca(double addrspace(1)* noalia
 ; SI-PROMOTE: ds_read_b64
 ; CI-PROMOTE: ds_write_b128
 ; CI-PROMOTE: ds_read_b128
-define amdgpu_kernel void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
-  %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
+define amdgpu_kernel void @private_access_v2f64_alloca(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %b) #1 {
+  %val = load <2 x double>, ptr addrspace(1) %in, align 16
   %array = alloca [4 x <2 x double>], align 16, addrspace(5)
-  %ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>] addrspace(5)* %array, i32 0, i32 %b
-  store <2 x double> %val, <2 x double> addrspace(5)* %ptr, align 16
+  %ptr = getelementptr inbounds [4 x <2 x double>], ptr addrspace(5) %array, i32 0, i32 %b
+  store <2 x double> %val, ptr addrspace(5) %ptr, align 16
   call void @llvm.amdgcn.s.barrier()
-  %result = load <2 x double>, <2 x double> addrspace(5)* %ptr, align 16
-  store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
+  %result = load <2 x double>, ptr addrspace(5) %ptr, align 16
+  store <2 x double> %result, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -77,14 +77,14 @@ define amdgpu_kernel void @private_access_v2f64_alloca(<2 x double> addrspace(1)
 ; SI-PROMOTE: ds_read_b64
 ; CI-PROMOTE: ds_write_b64
 ; CI-PROMOTE: ds_read_b64
-define amdgpu_kernel void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 {
-  %val = load i64, i64 addrspace(1)* %in, align 8
+define amdgpu_kernel void @private_access_i64_alloca(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %b) #1 {
+  %val = load i64, ptr addrspace(1) %in, align 8
   %array = alloca [8 x i64], align 8, addrspace(5)
-  %ptr = getelementptr inbounds [8 x i64], [8 x i64] addrspace(5)* %array, i32 0, i32 %b
-  store i64 %val, i64 addrspace(5)* %ptr, align 8
+  %ptr = getelementptr inbounds [8 x i64], ptr addrspace(5) %array, i32 0, i32 %b
+  store i64 %val, ptr addrspace(5) %ptr, align 8
   call void @llvm.amdgcn.s.barrier()
-  %result = load i64, i64 addrspace(5)* %ptr, align 8
-  store i64 %result, i64 addrspace(1)* %out, align 8
+  %result = load i64, ptr addrspace(5) %ptr, align 8
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -109,14 +109,14 @@ define amdgpu_kernel void @private_access_i64_alloca(i64 addrspace(1)* noalias %
 ; SI-PROMOTE: ds_read_b64
 ; CI-PROMOTE: ds_write_b128
 ; CI-PROMOTE: ds_read_b128
-define amdgpu_kernel void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
-  %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
+define amdgpu_kernel void @private_access_v2i64_alloca(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %b) #1 {
+  %val = load <2 x i64>, ptr addrspace(1) %in, align 16
   %array = alloca [4 x <2 x i64>], align 16, addrspace(5)
-  %ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>] addrspace(5)* %array, i32 0, i32 %b
-  store <2 x i64> %val, <2 x i64> addrspace(5)* %ptr, align 16
+  %ptr = getelementptr inbounds [4 x <2 x i64>], ptr addrspace(5) %array, i32 0, i32 %b
+  store <2 x i64> %val, ptr addrspace(5) %ptr, align 16
   call void @llvm.amdgcn.s.barrier()
-  %result = load <2 x i64>, <2 x i64> addrspace(5)* %ptr, align 16
-  store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
+  %result = load <2 x i64>, ptr addrspace(5) %ptr, align 16
+  store <2 x i64> %result, ptr addrspace(1) %out, align 16
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll b/llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll
index e44feac835a13..8dfba9b823641 100644
--- a/llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll
+++ b/llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll
@@ -4,22 +4,22 @@
 ; Make sure shaders with uniform, unmodified global address space
 ; loads are accessed with scalar loads.
 
-define amdgpu_ps i32 @ps_load_uniform_global_i32_align4(i32 addrspace(1)* inreg %ptr) {
+define amdgpu_ps i32 @ps_load_uniform_global_i32_align4(ptr addrspace(1) inreg %ptr) {
 ; GCN-LABEL: ps_load_uniform_global_i32_align4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
-  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %load = load i32, ptr addrspace(1) %ptr, align 4
   ret i32 %load
 }
 
-define amdgpu_cs i32 @cs_load_uniform_global_i32_align4(i32 addrspace(1)* inreg %ptr) {
+define amdgpu_cs i32 @cs_load_uniform_global_i32_align4(ptr addrspace(1) inreg %ptr) {
 ; GCN-LABEL: cs_load_uniform_global_i32_align4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
-  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %load = load i32, ptr addrspace(1) %ptr, align 4
   ret i32 %load
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index 5eb9490d184f1..83d424d93453f 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s
 ; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
 
-define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) {
 ; SI-LABEL: infinite_loop:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -20,17 +20,17 @@ define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
 ; IR-NEXT:  entry:
 ; IR-NEXT:    br label [[LOOP:%.*]]
 ; IR:       loop:
-; IR-NEXT:    store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4
+; IR-NEXT:    store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
 ; IR-NEXT:    br label [[LOOP]]
 entry:
   br label %loop
 
 loop:
-  store volatile i32 999, i32 addrspace(1)* %out, align 4
+  store volatile i32 999, ptr addrspace(1) %out, align 4
   br label %loop
 }
 
-define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) {
 ; SI-LABEL: infinite_loop_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -57,7 +57,7 @@ define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) {
 ; IR-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP]], 1
 ; IR-NEXT:    br i1 [[COND]], label [[LOOP:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]]
 ; IR:       loop:
-; IR-NEXT:    store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4
+; IR-NEXT:    store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
 ; IR-NEXT:    br i1 true, label [[LOOP]], label [[UNIFIEDRETURNBLOCK]]
 ; IR:       UnifiedReturnBlock:
 ; IR-NEXT:    ret void
@@ -67,14 +67,14 @@ entry:
   br i1 %cond, label %loop, label %return
 
 loop:
-  store volatile i32 999, i32 addrspace(1)* %out, align 4
+  store volatile i32 999, ptr addrspace(1) %out, align 4
   br label %loop
 
 return:
   ret void
 }
 
-define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) {
 ; SI-LABEL: infinite_loops:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -117,10 +117,10 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
 ; IR-NEXT:  entry:
 ; IR-NEXT:    br i1 undef, label [[LOOP1:%.*]], label [[LOOP2:%.*]]
 ; IR:       loop1:
-; IR-NEXT:    store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4
+; IR-NEXT:    store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
 ; IR-NEXT:    br i1 true, label [[LOOP1]], label [[DUMMYRETURNBLOCK:%.*]]
 ; IR:       loop2:
-; IR-NEXT:    store volatile i32 888, i32 addrspace(1)* [[OUT]], align 4
+; IR-NEXT:    store volatile i32 888, ptr addrspace(1) [[OUT]], align 4
 ; IR-NEXT:    br i1 true, label [[LOOP2]], label [[DUMMYRETURNBLOCK]]
 ; IR:       DummyReturnBlock:
 ; IR-NEXT:    ret void
@@ -128,15 +128,15 @@ entry:
   br i1 undef, label %loop1, label %loop2
 
 loop1:
-  store volatile i32 999, i32 addrspace(1)* %out, align 4
+  store volatile i32 999, ptr addrspace(1) %out, align 4
   br label %loop1
 
 loop2:
-  store volatile i32 888, i32 addrspace(1)* %out, align 4
+  store volatile i32 888, ptr addrspace(1) %out, align 4
   br label %loop2
 }
 
-define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) {
 ; SI-LABEL: infinite_loop_nest_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -177,7 +177,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
 ; IR:       outer_loop:
 ; IR-NEXT:    br label [[INNER_LOOP:%.*]]
 ; IR:       inner_loop:
-; IR-NEXT:    store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4
+; IR-NEXT:    store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
 ; IR-NEXT:    [[COND3:%.*]] = icmp eq i32 [[TMP]], 3
 ; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK]]
 ; IR:       TransitionBlock:
@@ -195,7 +195,7 @@ outer_loop:
   br label %inner_loop
 
 inner_loop:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 999, i32 addrspace(1)* %out, align 4
+  store volatile i32 999, ptr addrspace(1) %out, align 4
   %cond3 = icmp eq i32 %tmp, 3
   br i1 %cond3, label %inner_loop, label %outer_loop
 

diff  --git a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
index 9dc871439c64d..f20d720c3876b 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
@@ -42,10 +42,10 @@ entry:
 
 ; FIXME: This should warn too
 ; ERR-NOT: warning
-define amdgpu_kernel void @def_exec(i64 addrspace(1)* %ptr) {
+define amdgpu_kernel void @def_exec(ptr addrspace(1) %ptr) {
 entry:
   %exec = call i64 asm sideeffect "; def $0", "={exec}"()
-  store i64 %exec, i64 addrspace(1)* %ptr
+  store i64 %exec, ptr addrspace(1) %ptr
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index 48220392aea29..b9637ab49baa4 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -4,9 +4,9 @@
 ; CHECK-LABEL: {{^}}inline_asm:
 ; CHECK: s_endpgm
 ; CHECK: s_endpgm
-define amdgpu_kernel void @inline_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @inline_asm(ptr addrspace(1) %out) {
 entry:
-  store i32 5, i32 addrspace(1)* %out
+  store i32 5, ptr addrspace(1) %out
   call void asm sideeffect "s_endpgm", ""()
   ret void
 }
@@ -26,13 +26,13 @@ entry:
 ; CHECK: v_mov_b32 v{{[0-9]+}}, 0
 ; CHECK: v_cmp_eq_u32
 ; CHECK: s_and_saveexec_b64
-define amdgpu_kernel void @branch_on_asm_vgpr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @branch_on_asm_vgpr(ptr addrspace(1) %out) {
 	%zero = call i32 asm "v_mov_b32 $0, 0", "=v"()
 	%cmp = icmp eq i32 %zero, 0
 	br i1 %cmp, label %if, label %endif
 
 if:
-	store i32 0, i32 addrspace(1)* %out
+	store i32 0, ptr addrspace(1) %out
 	br label %endif
 
 endif:
@@ -44,13 +44,13 @@ endif:
 ; CHECK: s_mov_b32 s{{[0-9]+}}, 0
 ; CHECK: s_cmp_lg_u32
 ; CHECK: s_cbranch_scc0
-define amdgpu_kernel void @branch_on_asm_sgpr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @branch_on_asm_sgpr(ptr addrspace(1) %out) {
 	%zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
 	%cmp = icmp eq i32 %zero, 0
 	br i1 %cmp, label %if, label %endif
 
 if:
-	store i32 0, i32 addrspace(1)* %out
+	store i32 0, ptr addrspace(1) %out
 	br label %endif
 
 endif:
@@ -63,15 +63,15 @@ endif:
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]]
 ; CHECK: buffer_store_dwordx2 v[[[V_LO]]:[[V_HI]]]
-define amdgpu_kernel void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @v_cmp_asm(ptr addrspace(1) %out, i32 %in) {
   %sgpr = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 %in)
-  store i64 %sgpr, i64 addrspace(1)* %out
+  store i64 %sgpr, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm:
 ; CHECK: codeLenInByte = 12
-define amdgpu_kernel void @code_size_inline_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "v_nop_e64", ""()
   ret void
@@ -80,7 +80,7 @@ entry:
 ; All inlineasm instructions are assumed to be the maximum size
 ; CHECK-LABEL: {{^}}code_size_inline_asm_small_inst:
 ; CHECK: codeLenInByte = 12
-define amdgpu_kernel void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_small_inst(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "v_nop_e32", ""()
   ret void
@@ -88,7 +88,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst:
 ; CHECK: codeLenInByte = 20
-define amdgpu_kernel void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_inst(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "
     v_nop_e64
@@ -99,7 +99,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst_extra_newline:
 ; CHECK: codeLenInByte = 20
-define amdgpu_kernel void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_inst_extra_newline(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "
     v_nop_e64
@@ -111,7 +111,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_0_inst:
 ; CHECK: codeLenInByte = 4
-define amdgpu_kernel void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_0_inst(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "", ""()
   ret void
@@ -119,7 +119,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment:
 ; CHECK: codeLenInByte = 4
-define amdgpu_kernel void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_1_comment(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "; comment", ""()
   ret void
@@ -127,7 +127,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_newline_1_comment:
 ; CHECK: codeLenInByte = 4
-define amdgpu_kernel void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_newline_1_comment(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "
 ; comment", ""()
@@ -136,7 +136,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment_newline:
 ; CHECK: codeLenInByte = 4
-define amdgpu_kernel void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_1_comment_newline(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "; comment
 ", ""()
@@ -145,7 +145,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line:
 ; CHECK: codeLenInByte = 4
-define amdgpu_kernel void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_comments_line(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "; first comment ; second comment", ""()
   ret void
@@ -153,7 +153,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line_nospace:
 ; CHECK: codeLenInByte = 4
-define amdgpu_kernel void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_comments_line_nospace(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "; first comment;second comment", ""()
   ret void
@@ -161,7 +161,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments0:
 ; CHECK: codeLenInByte = 20
-define amdgpu_kernel void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_mixed_comments0(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "; comment
     v_nop_e64 ; inline comment
@@ -176,7 +176,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments1:
 ; CHECK: codeLenInByte = 20
-define amdgpu_kernel void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_mixed_comments1(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "v_nop_e64 ; inline comment
 ; separate comment
@@ -190,7 +190,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments_operands:
 ; CHECK: codeLenInByte = 20
-define amdgpu_kernel void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_mixed_comments_operands(ptr addrspace(1) %out) {
 entry:
   call void asm sideeffect "; comment
     v_add_i32_e32 v0, vcc, v1, v2 ; inline comment
@@ -235,9 +235,9 @@ entry:
 ; CHECK: {{buffer|flat}}_store_byte [[STORE]],
 define amdgpu_kernel void @i1_input_phys_vgpr() {
 entry:
-  %val = load i1, i1 addrspace(1)* undef
+  %val = load i1, ptr addrspace(1) undef
   %cc = call i1 asm sideeffect "; use $1, def $0 ", "={v1}, {v0}"(i1 %val)
-  store i1 %cc, i1 addrspace(1)* undef
+  store i1 %cc, ptr addrspace(1) undef
   ret void
 }
 
@@ -250,8 +250,8 @@ entry:
 ; CHECK-NEXT: ASMSTART
 define amdgpu_kernel void @i1_input_phys_vgpr_x2() {
 entry:
-  %val0 = load volatile i1, i1 addrspace(1)* undef
-  %val1 = load volatile i1, i1 addrspace(1)* undef
+  %val0 = load volatile i1, ptr addrspace(1) undef
+  %val1 = load volatile i1, ptr addrspace(1) undef
   call void asm sideeffect "; use $0 $1 ", "{v0}, {v1}"(i1 %val0, i1 %val1)
   ret void
 }
@@ -266,7 +266,7 @@ entry:
   %def0 = call i32 asm sideeffect "; def $0 ", "={v0}"()
   %def1 = call i32 asm sideeffect "; def $0 ", "={v0}"()
   %add = shl i32 %def0, %def1
-  store i32 %add, i32 addrspace(1)* undef
+  store i32 %add, ptr addrspace(1) undef
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
index 222a8a26f7f82..4fecdb576a6de 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
@@ -3,7 +3,7 @@
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -O3 -enable-no-infs-fp-math %s | FileCheck -check-prefix=GCN -check-prefix=NOINFS %s
 
 ; GCN: define float @foo(float %x) local_unnamed_addr #0 {
-; GCN: define amdgpu_kernel void @caller(float addrspace(1)* nocapture %p) local_unnamed_addr #1 {
+; GCN: define amdgpu_kernel void @caller(ptr addrspace(1) nocapture %p) local_unnamed_addr #1 {
 ; GCN: %mul.i = fmul float %load, 1.500000e+01
 
 ; UNSAFE: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "unsafe-fp-math"="true" }
@@ -21,11 +21,11 @@ entry:
   ret float %mul
 }
 
-define amdgpu_kernel void @caller(float addrspace(1)* %p) #1 {
+define amdgpu_kernel void @caller(ptr addrspace(1) %p) #1 {
 entry:
-  %load = load float, float addrspace(1)* %p, align 4
+  %load = load float, ptr addrspace(1) %p, align 4
   %call = call fast float @foo(float %load) #0
-  store float %call, float addrspace(1)* %p, align 4
+  store float %call, ptr addrspace(1) %p, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
index 06fa8ee65ddc6..0846c8eb0d5f1 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
@@ -17,19 +17,19 @@
 ; GCN: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
 ; GCN: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
 
-define amdgpu_kernel void @inline_reg_constraints(i32 addrspace(1)* %ptr) {
+define amdgpu_kernel void @inline_reg_constraints(ptr addrspace(1) %ptr) {
 entry:
-  %v32 = tail call i32 asm sideeffect "flat_load_dword   $0, $1", "=v,v"(i32 addrspace(1)* %ptr)
-  %v2_32 = tail call <2 x i32> asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(i32 addrspace(1)* %ptr)
-  %v64 =   tail call i64 asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(i32 addrspace(1)* %ptr)
-  %v4_32 = tail call <4 x i32> asm sideeffect "flat_load_dwordx4 $0, $1", "=v,v"(i32 addrspace(1)* %ptr)
-  %v128 =  tail call i128 asm sideeffect "flat_load_dwordx4 $0, $1", "=v,v"(i32 addrspace(1)* %ptr)
-  %s32 =   tail call i32 asm sideeffect "s_load_dword $0, $1", "=s,s"(i32 addrspace(1)* %ptr)
-  %s32_2 = tail call <2 x i32> asm sideeffect "s_load_dwordx2 $0, $1", "=s,s"(i32 addrspace(1)* %ptr)
-  %s64 =   tail call i64 asm sideeffect "s_load_dwordx2 $0, $1", "=s,s"(i32 addrspace(1)* %ptr)
-  %s4_32 =  tail call <4 x i32> asm sideeffect "s_load_dwordx4 $0, $1", "=s,s"(i32 addrspace(1)* %ptr)
-  %s128 =  tail call i128 asm sideeffect "s_load_dwordx4 $0, $1", "=s,s"(i32 addrspace(1)* %ptr)
-  %s256 =  tail call <8 x i32> asm sideeffect "s_load_dwordx8 $0, $1", "=s,s"(i32 addrspace(1)* %ptr)
+  %v32 = tail call i32 asm sideeffect "flat_load_dword   $0, $1", "=v,v"(ptr addrspace(1) %ptr)
+  %v2_32 = tail call <2 x i32> asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(ptr addrspace(1) %ptr)
+  %v64 =   tail call i64 asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(ptr addrspace(1) %ptr)
+  %v4_32 = tail call <4 x i32> asm sideeffect "flat_load_dwordx4 $0, $1", "=v,v"(ptr addrspace(1) %ptr)
+  %v128 =  tail call i128 asm sideeffect "flat_load_dwordx4 $0, $1", "=v,v"(ptr addrspace(1) %ptr)
+  %s32 =   tail call i32 asm sideeffect "s_load_dword $0, $1", "=s,s"(ptr addrspace(1) %ptr)
+  %s32_2 = tail call <2 x i32> asm sideeffect "s_load_dwordx2 $0, $1", "=s,s"(ptr addrspace(1) %ptr)
+  %s64 =   tail call i64 asm sideeffect "s_load_dwordx2 $0, $1", "=s,s"(ptr addrspace(1) %ptr)
+  %s4_32 =  tail call <4 x i32> asm sideeffect "s_load_dwordx4 $0, $1", "=s,s"(ptr addrspace(1) %ptr)
+  %s128 =  tail call i128 asm sideeffect "s_load_dwordx4 $0, $1", "=s,s"(ptr addrspace(1) %ptr)
+  %s256 =  tail call <8 x i32> asm sideeffect "s_load_dwordx8 $0, $1", "=s,s"(ptr addrspace(1) %ptr)
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/inline-maxbb.ll b/llvm/test/CodeGen/AMDGPU/inline-maxbb.ll
index 375f5fd31752e..cc9203a5e984f 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-maxbb.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-maxbb.ll
@@ -30,7 +30,7 @@ ret_res:
 
 define amdgpu_kernel void @caller(i32 %x) {
   %res = call i32 @callee(i32 %x)
-  store volatile i32 %res, i32 addrspace(1)* undef
+  store volatile i32 %res, ptr addrspace(1) undef
   ret void
 }
 
@@ -63,7 +63,7 @@ ret_res:
 
 define amdgpu_kernel void @caller_hint(i32 %x) {
   %res = call i32 @callee_hint(i32 %x)
-  store volatile i32 %res, i32 addrspace(1)* undef
+  store volatile i32 %res, ptr addrspace(1) undef
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll
index d4f18474e15a0..bb65697834303 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll
@@ -2,55 +2,55 @@
 
 ; GCN-LABEL: {{^}}inline_asm_input_v2i16:
 ; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @inline_asm_input_v2i16(i32 addrspace(1)* %out, <2 x i16> %in) #0 {
+define amdgpu_kernel void @inline_asm_input_v2i16(ptr addrspace(1) %out, <2 x i16> %in) #0 {
 entry:
   %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x i16> %in) #0
-  store i32 %val, i32 addrspace(1)* %out
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}inline_asm_input_v2f16:
 ; GCN: s_mov_b32 s0, s{{[0-9]+}}
-define amdgpu_kernel void @inline_asm_input_v2f16(i32 addrspace(1)* %out, <2 x half> %in) #0 {
+define amdgpu_kernel void @inline_asm_input_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 {
 entry:
   %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x half> %in) #0
-  store i32 %val, i32 addrspace(1)* %out
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}inline_asm_output_v2i16:
 ; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @inline_asm_output_v2i16(<2 x i16> addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @inline_asm_output_v2i16(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
   %val = call <2 x i16> asm "s_mov_b32 $0, $1", "=r,r"(i32 %in) #0
-  store <2 x i16> %val, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %val, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}inline_asm_output_v2f16:
 ; GCN: v_mov_b32 v{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @inline_asm_output_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @inline_asm_output_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
   %val = call <2 x half> asm "v_mov_b32 $0, $1", "=v,r"(i32 %in) #0
-  store <2 x half> %val, <2 x half> addrspace(1)* %out
+  store <2 x half> %val, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}inline_asm_packed_v2i16:
 ; GCN: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @inline_asm_packed_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %in0, <2 x i16> %in1) #0 {
+define amdgpu_kernel void @inline_asm_packed_v2i16(ptr addrspace(1) %out, <2 x i16> %in0, <2 x i16> %in1) #0 {
 entry:
   %val = call <2 x i16> asm "v_pk_add_u16 $0, $1, $2", "=v,r,v"(<2 x i16> %in0, <2 x i16> %in1) #0
-  store <2 x i16> %val, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %val, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}inline_asm_packed_v2f16:
 ; GCN: v_pk_add_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @inline_asm_packed_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in0, <2 x half> %in1) #0 {
+define amdgpu_kernel void @inline_asm_packed_v2f16(ptr addrspace(1) %out, <2 x half> %in0, <2 x half> %in1) #0 {
 entry:
   %val = call <2 x half> asm "v_pk_add_f16 $0, $1, $2", "=v,r,v"(<2 x half> %in0, <2 x half> %in1) #0
-  store <2 x half> %val, <2 x half> addrspace(1)* %out
+  store <2 x half> %val, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
index ceb02e38db837..0623110564b48 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
@@ -7,10 +7,10 @@
 ; GCN:        ds_read_b96
 ; GCN:        ds_write_b96
 ; GCN: ScratchSize: 0
-define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %a) nounwind {
-  %val = load <3 x i32>, <3 x i32> addrspace(3)* %out
+define amdgpu_kernel void @store_v3i32(ptr addrspace(3) %out, <3 x i32> %a) nounwind {
+  %val = load <3 x i32>, ptr addrspace(3) %out
   %val.1 = add <3 x i32> %a, %val
-  store <3 x i32> %val.1, <3 x i32> addrspace(3)* %out, align 16
+  store <3 x i32> %val.1, ptr addrspace(3) %out, align 16
   ret void
 }
 
@@ -20,9 +20,9 @@ define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %
 ; GCN:        ds_write_b32
 ; GCN:        ds_write_b128
 ; GCN: ScratchSize: 0
-define amdgpu_kernel void @store_v5i32(<5 x i32> addrspace(3)* %out, <5 x i32> %a) nounwind {
-  %val = load <5 x i32>, <5 x i32> addrspace(3)* %out
+define amdgpu_kernel void @store_v5i32(ptr addrspace(3) %out, <5 x i32> %a) nounwind {
+  %val = load <5 x i32>, ptr addrspace(3) %out
   %val.1 = add <5 x i32> %a, %val
-  store <5 x i32> %val.1, <5 x i32> addrspace(3)* %out, align 16
+  store <5 x i32> %val.1, ptr addrspace(3) %out, align 16
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_subreg.ll b/llvm/test/CodeGen/AMDGPU/insert_subreg.ll
index 2dc9277460716..eb30bcee9de26 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_subreg.ll
@@ -6,11 +6,11 @@
 
 ; Make sure this doesn't crash
 ; CHECK-LABEL: test:
-define amdgpu_kernel void @test(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out) {
 entry:
   %tmp0 = alloca [16 x i32], addrspace(5)
-  %tmp1 = ptrtoint [16 x i32] addrspace(5)* %tmp0 to i32
+  %tmp1 = ptrtoint ptr addrspace(5) %tmp0 to i32
   %tmp2 = sext i32 %tmp1 to i64
-  store i64 %tmp2, i64 addrspace(1)* %out
+  store i64 %tmp2, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index ade5d5154d749..032b8b89fb4ee 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
 
-define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
+define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) {
 ; GCN-LABEL: float4_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
@@ -30,11 +30,11 @@ define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x fl
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
-  store <4 x float> %v, <4 x float> addrspace(1)* %out
+  store <4 x float> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) {
+define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-LABEL: float4_inselt_undef:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -49,11 +49,11 @@ define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
-  store <4 x float> %v, <4 x float> addrspace(1)* %out
+  store <4 x float> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) {
+define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) {
 ; GCN-LABEL: int4_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
@@ -78,11 +78,11 @@ define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <4 x i32> %vec, i32 1, i32 %sel
-  store <4 x i32> %v, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
+define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) {
 ; GCN-LABEL: float2_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0x34
@@ -102,11 +102,11 @@ define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x fl
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
-  store <2 x float> %v, <2 x float> addrspace(1)* %out
+  store <2 x float> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) {
+define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec, i32 %sel) {
 ; GCN-LABEL: float8_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x44
@@ -135,11 +135,11 @@ define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x fl
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
-  store <8 x float> %v, <8 x float> addrspace(1)* %out
+  store <8 x float> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
+define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %vec, i32 %sel) {
 ; GCN-LABEL: float16_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
@@ -188,11 +188,11 @@ define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
-  store <16 x float> %v, <16 x float> addrspace(1)* %out
+  store <16 x float> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @float32_inselt(<32 x float> addrspace(1)* %out, <32 x float> %vec, i32 %sel) {
+define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %vec, i32 %sel) {
 ; GCN-LABEL: float32_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0xa4
@@ -282,11 +282,11 @@ define amdgpu_kernel void @float32_inselt(<32 x float> addrspace(1)* %out, <32 x
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
-  store <32 x float> %v, <32 x float> addrspace(1)* %out
+  store <32 x float> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
+define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half4_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s7, s[0:1], 0x34
@@ -308,11 +308,11 @@ define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
-  store <4 x half> %v, <4 x half> addrspace(1)* %out
+  store <4 x half> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
+define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half2_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -329,11 +329,11 @@ define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
-  store <2 x half> %v, <2 x half> addrspace(1)* %out
+  store <2 x half> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) {
+define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half8_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
@@ -391,11 +391,11 @@ define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
-  store <8 x half> %v, <8 x half> addrspace(1)* %out
+  store <8 x half> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
+define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) {
 ; GCN-LABEL: short2_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -412,11 +412,11 @@ define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
-  store <2 x i16> %v, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
+define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
 ; GCN-LABEL: short4_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s7, s[0:1], 0x34
@@ -438,11 +438,11 @@ define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <4 x i16> %vec, i16 1, i32 %sel
-  store <4 x i16> %v, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
+define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
 ; GCN-LABEL: byte8_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
@@ -463,11 +463,11 @@ define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %v
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
-  store <8 x i8> %v, <8 x i8> addrspace(1)* %out
+  store <8 x i8> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) {
+define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, i32 %sel) {
 ; GCN-LABEL: byte16_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
@@ -576,11 +576,11 @@ define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8>
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <16 x i8> %vec, i8 1, i32 %sel
-  store <16 x i8> %v, <16 x i8> addrspace(1)* %out
+  store <16 x i8> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
+define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) {
 ; GCN-LABEL: double2_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
@@ -603,11 +603,11 @@ define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
-  store <2 x double> %v, <2 x double> addrspace(1)* %out
+  store <2 x double> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
+define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) {
 ; GCN-LABEL: double5_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
@@ -663,11 +663,11 @@ define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
-  store <5 x double> %v, <5 x double> addrspace(1)* %out
+  store <5 x double> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
+define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) {
 ; GCN-LABEL: double8_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xa4
@@ -719,11 +719,11 @@ define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
-  store <8 x double> %v, <8 x double> addrspace(1)* %out
+  store <8 x double> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @double7_inselt(<7 x double> addrspace(1)* %out, <7 x double> %vec, i32 %sel) {
+define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %vec, i32 %sel) {
 ; GCN-LABEL: double7_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
@@ -772,11 +772,11 @@ define amdgpu_kernel void @double7_inselt(<7 x double> addrspace(1)* %out, <7 x
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel
-  store <7 x double> %v, <7 x double> addrspace(1)* %out
+  store <7 x double> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @double16_inselt(<16 x double> addrspace(1)* %out, <16 x double> %vec, i32 %sel) {
+define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> %vec, i32 %sel) {
 ; GCN-LABEL: double16_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x124
@@ -869,11 +869,11 @@ define amdgpu_kernel void @double16_inselt(<16 x double> addrspace(1)* %out, <16
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel
-  store <16 x double> %v, <16 x double> addrspace(1)* %out
+  store <16 x double> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15 x double> %vec, i32 %sel) {
+define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> %vec, i32 %sel) {
 ; GCN-LABEL: double15_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0xa4
@@ -964,11 +964,11 @@ define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <15 x double> %vec, double 1.000000e+00, i32 %sel
-  store <15 x double> %v, <15 x double> addrspace(1)* %out
+  store <15 x double> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) {
+define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) {
 ; GCN-LABEL: bit4_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
@@ -1019,11 +1019,11 @@ define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %ve
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <4 x i1> %vec, i1 1, i32 %sel
-  store <4 x i1> %v, <4 x i1> addrspace(1)* %out
+  store <4 x i1> %v, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
+define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, i32 %sel) {
 ; GCN-LABEL: bit128_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
@@ -1933,7 +1933,7 @@ define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <128 x i1> %vec, i1 1, i32 %sel
-  store <128 x i1> %v, <128 x i1> addrspace(1)* %out
+  store <128 x i1> %v, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 8ee9beeee8e44..95d0b18f9b15b 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s
 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
 
-define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
 ; GFX9-LABEL: s_insertelement_v2i16_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -43,14 +43,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out,
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
+  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %vecins, ptr addrspace(1) %out
   ret void
 }
 
 
-define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
 ; GFX9-LABEL: s_insertelement_v2i16_0_reg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -110,13 +110,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
+  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %vecins, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
 ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -192,16 +192,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
+  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %elt1 = extractelement <2 x i16> %vec, i32 1
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %vecins, ptr addrspace(1) %out
   %use1 = zext i16 %elt1 to i32
   call void asm sideeffect "; use $0", "s"(i32 %use1) #0
   ret void
 }
 
-define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -260,15 +260,15 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
+  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %elt.hi = lshr i32 %elt.arg, 16
   %elt = trunc i32 %elt.hi to i16
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %vecins, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 {
 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -342,17 +342,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
+  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %elt.hi = lshr i32 %elt.arg, 16
   %elt = trunc i32 %elt.hi to i16
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %vecins, ptr addrspace(1) %out
   %use1 = zext i16 %elt to i32
   call void asm sideeffect "; use $0", "s"(i32 %use1) #0
   ret void
 }
 
-define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 {
 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -440,12 +440,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
+  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %elt.hi = lshr i32 %elt.arg, 16
   %elt = trunc i32 %elt.hi to i16
   %vec.hi = extractelement <2 x i16> %vec, i32 1
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %vecins, ptr addrspace(1) %out
   %use1 = zext i16 %elt to i32
   %vec.hi.use1 = zext i16 %vec.hi to i32
 
@@ -454,7 +454,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i
   ret void
 }
 
-define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
 ; GFX9-LABEL: s_insertelement_v2i16_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -493,13 +493,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
+  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %vecins, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
 ; GFX9-LABEL: s_insertelement_v2i16_1_reg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -559,13 +559,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
+  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %vecins, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
+define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
 ; GFX9-LABEL: s_insertelement_v2f16_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -606,13 +606,13 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
+  %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
-  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
+  store <2 x half> %vecins, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
+define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
 ; GFX9-LABEL: s_insertelement_v2f16_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -651,13 +651,13 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
+  %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
-  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
+  store <2 x half> %vecins, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_insertelement_v2i16_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -721,15 +721,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out,
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %elt.arg) #0 {
 ; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -796,17 +796,17 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
   %elt.hi = lshr i32 %elt.arg, 16
   %elt = trunc i32 %elt.hi to i16
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -868,16 +868,16 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
   %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
-define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_insertelement_v2i16_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -941,15 +941,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1011,15 +1011,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
   %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_insertelement_v2f16_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1083,15 +1083,15 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <2 x half>, ptr addrspace(1) %in.gep
   %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
-  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
+  store <2 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1153,15 +1153,15 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <2 x half>, ptr addrspace(1) %in.gep
   %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0
-  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
+  store <2 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_insertelement_v2f16_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1225,15 +1225,15 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <2 x half>, ptr addrspace(1) %in.gep
   %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
-  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
+  store <2 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1295,16 +1295,16 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspac
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <2 x half>, ptr addrspace(1) %in.gep
   %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1
-  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
+  store <2 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; FIXME: Enable for others when argument load not split
-define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(4) %idx.ptr) #0 {
 ; GFX9-LABEL: s_insertelement_v2i16_dynamic:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
@@ -1381,14 +1381,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %idx = load volatile i32, i32 addrspace(4)* %idx.ptr
-  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
+  %idx = load volatile i32, ptr addrspace(4) %idx.ptr
+  %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %vecins, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 {
 ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1463,15 +1463,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 {
 ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
@@ -1564,17 +1564,17 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
-  %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
-  %idx = load i32, i32 addrspace(1)* %idx.gep
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %idx.gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %idx = load i32, ptr addrspace(1) %idx.gep
+  %vec = load <2 x half>, ptr addrspace(1) %in.gep
   %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
-  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
+  store <2 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
 ; GFX9-LABEL: v_insertelement_v4f16_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1643,17 +1643,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <4 x half>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to half
   %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
-  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
+  store <4 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
 ; GFX9-LABEL: v_insertelement_v4f16_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1721,17 +1721,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <4 x half>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to half
   %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
-  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
+  store <4 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
 ; GFX9-LABEL: v_insertelement_v4f16_2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1800,17 +1800,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <4 x half>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to half
   %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
-  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
+  store <4 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
 ; GFX9-LABEL: v_insertelement_v4f16_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1878,17 +1878,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <4 x half>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to half
   %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
-  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
+  store <4 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
 ; GFX9-LABEL: v_insertelement_v4i16_2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1957,18 +1957,18 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <4 x i16>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to i16
   %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
-  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
+  store <4 x i16> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; FIXME: Better code on CI?
-define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
+define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
 ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2066,18 +2066,18 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
-  %idx.val = load volatile i32, i32 addrspace(1)* undef
-  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %idx.val = load volatile i32, ptr addrspace(1) undef
+  %vec = load <4 x i16>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to i16
   %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
-  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
+  store <4 x i16> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
+define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 {
 ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2170,17 +2170,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <4 x half>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to half
   %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
-  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
+  store <4 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v8f16_3(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val) {
+define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
 ; GFX9-LABEL: v_insertelement_v8f16_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2250,17 +2250,17 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(<8 x half> addrspace(1)* %out
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <8 x half>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to half
   %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 3
-  store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep
+  store <8 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v8i16_6(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %val) {
+define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
 ; GFX9-LABEL: v_insertelement_v8i16_6:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2330,17 +2330,17 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(<8 x i16> addrspace(1)* %out,
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <8 x i16>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to i16
   %vecins = insertelement <8 x i16> %vec, i16 %val.cvt, i32 6
-  store <8 x i16> %vecins, <8 x i16> addrspace(1)* %out.gep
+  store <8 x i16> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v8f16_dynamic(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val, i32 %n) {
+define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
 ; GFX9-LABEL: v_insertelement_v8f16_dynamic:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2556,17 +2556,17 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(<8 x half> addrspace(1)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <8 x half>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to half
   %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 %n
-  store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep
+  store <8 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v16f16_3(<16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %in, i32 %val) {
+define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
 ; GFX9-LABEL: v_insertelement_v16f16_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2656,17 +2656,17 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(<16 x half> addrspace(1)* %o
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <16 x half>, <16 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <16 x half>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to half
   %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 3
-  store <16 x half> %vecins, <16 x half> addrspace(1)* %out.gep
+  store <16 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v16i16_6(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in, i32 %val) {
+define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
 ; GFX9-LABEL: v_insertelement_v16i16_6:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2757,17 +2757,17 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(<16 x i16> addrspace(1)* %ou
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <16 x i16>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to i16
   %vecins = insertelement <16 x i16> %vec, i16 %val.cvt, i32 6
-  store <16 x i16> %vecins, <16 x i16> addrspace(1)* %out.gep
+  store <16 x i16> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 
-define amdgpu_kernel void @v_insertelement_v16f16_dynamic(<16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %in, i32 %val, i32 %n) {
+define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
 ; GFX9-LABEL: v_insertelement_v16f16_dynamic:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -3157,13 +3157,13 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(<16 x half> addrspace(
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %in, i64 %tid.ext
-  %out.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %out, i64 %tid.ext
-  %vec = load <16 x half>, <16 x half> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %out, i64 %tid.ext
+  %vec = load <16 x half>, ptr addrspace(1) %in.gep
   %val.trunc = trunc i32 %val to i16
   %val.cvt = bitcast i16 %val.trunc to half
   %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 %n
-  store <16 x half> %vecins, <16 x half> addrspace(1)* %out.gep
+  store <16 x half> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
index 0e3deef5a3651..80ed8318d8abe 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll
@@ -18,16 +18,16 @@
 
 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %idx = load i32, i32 addrspace(1)* %idx.gep
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %idx.gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %idx = load i32, ptr addrspace(1) %idx.gep
+  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
index 316c9e7ea29c2..c63fe3d6d2a37 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
@@ -16,16 +16,16 @@
 
 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
 ; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
-define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %idx = load i32, i32 addrspace(1)* %idx.gep
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %idx.gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %idx = load i32, ptr addrspace(1) %idx.gep
+  %vec = load <2 x i16>, ptr addrspace(1) %in.gep
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
-  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
+  store <2 x i16> %vecins, ptr addrspace(1) %out.gep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
index 089769be1f593..6e11346c0653e 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -1,37 +1,37 @@
 ; RUN: not llc -global-isel=0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 ; RUN: not llc -global-isel=1 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
-; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
-define amdgpu_kernel void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
-  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)*
-  store volatile i32 0, i32 addrspace(1)* %stof
+; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (ptr addrspace(3)): invalid addrspacecast
+define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) {
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(1)
+  store volatile i32 0, ptr addrspace(1) %stof
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function use_local_to_constant32bit_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
-define amdgpu_kernel void @use_local_to_constant32bit_addrspacecast(i32 addrspace(3)* %ptr) {
-  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(6)*
-  %load = load volatile i32, i32 addrspace(6)* %stof
+; ERROR: error: <unknown>:0:0: in function use_local_to_constant32bit_addrspacecast void (ptr addrspace(3)): invalid addrspacecast
+define amdgpu_kernel void @use_local_to_constant32bit_addrspacecast(ptr addrspace(3) %ptr) {
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(6)
+  %load = load volatile i32, ptr addrspace(6) %stof
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function use_constant32bit_to_local_addrspacecast void (i32 addrspace(6)*): invalid addrspacecast
-define amdgpu_kernel void @use_constant32bit_to_local_addrspacecast(i32 addrspace(6)* %ptr) {
-  %cast = addrspacecast i32 addrspace(6)* %ptr to i32 addrspace(3)*
-  %load = load volatile i32, i32 addrspace(3)* %cast
+; ERROR: error: <unknown>:0:0: in function use_constant32bit_to_local_addrspacecast void (ptr addrspace(6)): invalid addrspacecast
+define amdgpu_kernel void @use_constant32bit_to_local_addrspacecast(ptr addrspace(6) %ptr) {
+  %cast = addrspacecast ptr addrspace(6) %ptr to ptr addrspace(3)
+  %load = load volatile i32, ptr addrspace(3) %cast
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function use_local_to_42_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
-define amdgpu_kernel void @use_local_to_42_addrspacecast(i32 addrspace(3)* %ptr) {
-  %cast = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(42)*
-  store volatile i32 addrspace(42)* %cast, i32 addrspace(42)* addrspace(1)* null
+; ERROR: error: <unknown>:0:0: in function use_local_to_42_addrspacecast void (ptr addrspace(3)): invalid addrspacecast
+define amdgpu_kernel void @use_local_to_42_addrspacecast(ptr addrspace(3) %ptr) {
+  %cast = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(42)
+  store volatile ptr addrspace(42) %cast, ptr addrspace(1) null
   ret void
 }
 
-; ERROR: error: <unknown>:0:0: in function use_42_to_local_addrspacecast void (i32 addrspace(42)*): invalid addrspacecast
-define amdgpu_kernel void @use_42_to_local_addrspacecast(i32 addrspace(42)* %ptr) {
-  %cast = addrspacecast i32 addrspace(42)* %ptr to i32 addrspace(3)*
-  %load = load volatile i32, i32 addrspace(3)* %cast
+; ERROR: error: <unknown>:0:0: in function use_42_to_local_addrspacecast void (ptr addrspace(42)): invalid addrspacecast
+define amdgpu_kernel void @use_42_to_local_addrspacecast(ptr addrspace(42) %ptr) {
+  %cast = addrspacecast ptr addrspace(42) %ptr to ptr addrspace(3)
+  %load = load volatile i32, ptr addrspace(3) %cast
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/kcache-fold.ll b/llvm/test/CodeGen/AMDGPU/kcache-fold.ll
index 37dd977ae216e..ba80e5c3907bb 100644
--- a/llvm/test/CodeGen/AMDGPU/kcache-fold.ll
+++ b/llvm/test/CodeGen/AMDGPU/kcache-fold.ll
@@ -4,35 +4,35 @@
 ; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}}
 define amdgpu_kernel void @main1() #0 {
 main_body:
-  %tmp = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp = load <4 x float>, ptr addrspace(8) null
   %tmp7 = extractelement <4 x float> %tmp, i32 0
-  %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp8 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %tmp9 = extractelement <4 x float> %tmp8, i32 0
-  %tmp10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp10 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %tmp11 = extractelement <4 x float> %tmp10, i32 0
   %tmp12 = fcmp ogt float %tmp7, 0.000000e+00
   %tmp13 = select i1 %tmp12, float %tmp9, float %tmp11
-  %tmp14 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp14 = load <4 x float>, ptr addrspace(8) null
   %tmp15 = extractelement <4 x float> %tmp14, i32 1
-  %tmp16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp16 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %tmp17 = extractelement <4 x float> %tmp16, i32 1
-  %tmp18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp18 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %tmp19 = extractelement <4 x float> %tmp18, i32 1
   %tmp20 = fcmp ogt float %tmp15, 0.000000e+00
   %tmp21 = select i1 %tmp20, float %tmp17, float %tmp19
-  %tmp22 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp22 = load <4 x float>, ptr addrspace(8) null
   %tmp23 = extractelement <4 x float> %tmp22, i32 2
-  %tmp24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp24 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %tmp25 = extractelement <4 x float> %tmp24, i32 2
-  %tmp26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp26 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %tmp27 = extractelement <4 x float> %tmp26, i32 2
   %tmp28 = fcmp ogt float %tmp23, 0.000000e+00
   %tmp29 = select i1 %tmp28, float %tmp25, float %tmp27
-  %tmp30 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp30 = load <4 x float>, ptr addrspace(8) null
   %tmp31 = extractelement <4 x float> %tmp30, i32 3
-  %tmp32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp32 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %tmp33 = extractelement <4 x float> %tmp32, i32 3
-  %tmp34 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp34 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %tmp35 = extractelement <4 x float> %tmp34, i32 3
   %tmp36 = fcmp ogt float %tmp31, 0.000000e+00
   %tmp37 = select i1 %tmp36, float %tmp33, float %tmp35
@@ -56,35 +56,35 @@ main_body:
 ; CHECK-NOT: MOV
 define amdgpu_kernel void @main2() #0 {
 main_body:
-  %tmp = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp = load <4 x float>, ptr addrspace(8) null
   %tmp7 = extractelement <4 x float> %tmp, i32 0
-  %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp8 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %tmp9 = extractelement <4 x float> %tmp8, i32 0
-  %tmp10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp10 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %tmp11 = extractelement <4 x float> %tmp10, i32 1
   %tmp12 = fcmp ogt float %tmp7, 0.000000e+00
   %tmp13 = select i1 %tmp12, float %tmp9, float %tmp11
-  %tmp14 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp14 = load <4 x float>, ptr addrspace(8) null
   %tmp15 = extractelement <4 x float> %tmp14, i32 1
-  %tmp16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp16 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %tmp17 = extractelement <4 x float> %tmp16, i32 0
-  %tmp18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp18 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %tmp19 = extractelement <4 x float> %tmp18, i32 1
   %tmp20 = fcmp ogt float %tmp15, 0.000000e+00
   %tmp21 = select i1 %tmp20, float %tmp17, float %tmp19
-  %tmp22 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp22 = load <4 x float>, ptr addrspace(8) null
   %tmp23 = extractelement <4 x float> %tmp22, i32 2
-  %tmp24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp24 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %tmp25 = extractelement <4 x float> %tmp24, i32 3
-  %tmp26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp26 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %tmp27 = extractelement <4 x float> %tmp26, i32 2
   %tmp28 = fcmp ogt float %tmp23, 0.000000e+00
   %tmp29 = select i1 %tmp28, float %tmp25, float %tmp27
-  %tmp30 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp30 = load <4 x float>, ptr addrspace(8) null
   %tmp31 = extractelement <4 x float> %tmp30, i32 3
-  %tmp32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp32 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %tmp33 = extractelement <4 x float> %tmp32, i32 3
-  %tmp34 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp34 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %tmp35 = extractelement <4 x float> %tmp34, i32 2
   %tmp36 = fcmp ogt float %tmp31, 0.000000e+00
   %tmp37 = select i1 %tmp36, float %tmp33, float %tmp35

diff  --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
index c2fb908472a27..f956251e4fb54 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -96,17 +96,17 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
   %arr = alloca < 1339 x i32>, align 8192, addrspace(5)
   %cmp = icmp ne i32 %val, 0
   %vreg = call i32 asm sideeffect "; def vgpr10", "={v10}"()
-  call void @device_func(<1339 x i32> addrspace(5)* %arr)
+  call void @device_func(ptr addrspace(5) %arr)
   br i1 %cmp, label %store, label %end
 
 store:
-  store volatile i32 %vreg, i32 addrspace(3)* undef
+  store volatile i32 %vreg, ptr addrspace(3) undef
   ret void
 
 end:
   ret void
 }
 
-declare void @device_func(<1339 x i32> addrspace(5)*)
+declare void @device_func(ptr addrspace(5))
 
 attributes #0 = { nounwind "frame-pointer"="all" }

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
index f00f59b2ffc2b..da9ca750d85f3 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -9,16 +9,15 @@
 @lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef
 @lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef
 
-declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1) #0
-declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #0
+declare void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) nocapture, ptr addrspace(1) nocapture readonly, i32, i1) #0
+declare void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #0
 
 
 ; HSA-LABEL: {{^}}test_no_round_size_1:
 ; HSA: workgroup_group_segment_byte_size = 38
-define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
-  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false)
+define amdgpu_kernel void @test_no_round_size_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 @lds.align16.0, ptr addrspace(1) align 4 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 @lds.align16.0, i32 38, i1 false)
   ret void
 }
 
@@ -34,14 +33,12 @@ define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrsp
 ; HSA-LABEL: {{^}}test_round_size_2:
 ; HSA: workgroup_group_segment_byte_size = 86
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
-  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false)
+define amdgpu_kernel void @test_round_size_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 @lds.align16.0, ptr addrspace(1) align 4 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 @lds.align16.0, i32 38, i1 false)
 
-  %lds.align16.1.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.1 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.1.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.1.bc, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 @lds.align16.1, ptr addrspace(1) align 4 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 @lds.align16.1, i32 38, i1 false)
 
   ret void
 }
@@ -50,14 +47,12 @@ define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace
 ; HSA-LABEL: {{^}}test_round_size_2_align_8:
 ; HSA: workgroup_group_segment_byte_size = 86
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
-  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)
+define amdgpu_kernel void @test_round_size_2_align_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
 
-  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
 
   ret void
 }
@@ -65,22 +60,21 @@ define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 a
 ; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
 ; HSA: workgroup_group_segment_byte_size = 38
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
-  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
+define amdgpu_kernel void @test_round_local_lds_and_arg(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(3) %lds.arg) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 @lds.align16.0, ptr addrspace(1) align 4 %in, i32 38, i1 false)
 
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false)
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.arg, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.arg, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 @lds.align16.0, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 %lds.arg, ptr addrspace(1) align 4 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 %lds.arg, i32 38, i1 false)
   ret void
 }
 
 ; HSA-LABEL: {{^}}test_round_lds_arg:
 ; HSA: workgroup_group_segment_byte_size = 0
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.arg, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.arg, i32 38, i1 false)
+define amdgpu_kernel void @test_round_lds_arg(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(3) %lds.arg) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 %lds.arg, ptr addrspace(1) align 4 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 %lds.arg, i32 38, i1 false)
   ret void
 }
 
@@ -88,9 +82,9 @@ define amdgpu_kernel void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspac
 ; HSA-LABEL: {{^}}test_high_align_lds_arg:
 ; HSA: workgroup_group_segment_byte_size = 0
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 64 %lds.arg, i8 addrspace(1)* align 64 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 64 %out, i8 addrspace(3)* align 64 %lds.arg, i32 38, i1 false)
+define amdgpu_kernel void @test_high_align_lds_arg(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(3) align 64 %lds.arg) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 64 %lds.arg, ptr addrspace(1) align 64 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 64 %out, ptr addrspace(3) align 64 %lds.arg, i32 38, i1 false)
   ret void
 }
 
@@ -98,14 +92,12 @@ define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 add
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
 ; HSA: workgroup_group_segment_byte_size = 216
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
-  %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false)
+define amdgpu_kernel void @test_missing_alignment_size_2_order0(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 @lds.missing.align.0, ptr addrspace(1) align 4 %in, i32 160, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 @lds.missing.align.0, i32 160, i1 false)
 
-  %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i8 addrspace(1)* align 8 %in, i32 56, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i32 56, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.missing.align.1, ptr addrspace(1) align 8 %in, i32 56, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.missing.align.1, i32 56, i1 false)
 
   ret void
 }
@@ -114,14 +106,12 @@ define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)*
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
 ; HSA: workgroup_group_segment_byte_size = 216
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
-  %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i8 addrspace(1)* align 8 %in, i32 56, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i32 56, i1 false)
+define amdgpu_kernel void @test_missing_alignment_size_2_order1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.missing.align.1, ptr addrspace(1) align 8 %in, i32 56, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.missing.align.1, i32 56, i1 false)
 
-  %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 @lds.missing.align.0, ptr addrspace(1) align 4 %in, i32 160, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 @lds.missing.align.0, i32 160, i1 false)
 
   ret void
 }
@@ -131,18 +121,15 @@ define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)*
 ; HSA-LABEL: {{^}}test_round_size_3_order0:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
-  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)
+define amdgpu_kernel void @test_round_size_3_order0(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align32.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align32.0, i32 38, i1 false)
 
-  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
 
-  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
 
   ret void
 }
@@ -152,18 +139,15 @@ define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 ad
 ; HSA-LABEL: {{^}}test_round_size_3_order1:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
-  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)
+define amdgpu_kernel void @test_round_size_3_order1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align32.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align32.0, i32 38, i1 false)
 
-  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
 
-  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
 
   ret void
 }
@@ -173,18 +157,15 @@ define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 ad
 ; HSA-LABEL: {{^}}test_round_size_3_order2:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
-  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)
+define amdgpu_kernel void @test_round_size_3_order2(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
 
-  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align32.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align32.0, i32 38, i1 false)
 
-  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
 
   ret void
 }
@@ -194,18 +175,15 @@ define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 ad
 ; HSA-LABEL: {{^}}test_round_size_3_order3:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
-  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)
+define amdgpu_kernel void @test_round_size_3_order3(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
 
-  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
 
-  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align32.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align32.0, i32 38, i1 false)
 
   ret void
 }
@@ -215,18 +193,15 @@ define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 ad
 ; HSA-LABEL: {{^}}test_round_size_3_order4:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
-  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)
+define amdgpu_kernel void @test_round_size_3_order4(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
 
-  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align32.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align32.0, i32 38, i1 false)
 
-  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
 
   ret void
 }
@@ -236,18 +211,15 @@ define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 ad
 ; HSA-LABEL: {{^}}test_round_size_3_order5:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
-define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
-  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)
-
-  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)
-
-  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
-  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
-  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)
+define amdgpu_kernel void @test_round_size_3_order5(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
+
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
+
+  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align32.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
+  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align32.0, i32 38, i1 false)
 
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll
index 10dfb1cce0f15..74eb1f403412f 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
-declare float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
+declare float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) nocapture, float, i32, i32, i1)
 
 ; GCN-LABEL: {{^}}lds_ds_fadd:
 ; VI-DAG: s_mov_b32 m0
@@ -11,15 +11,15 @@ declare float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* nocapture, float, i32
 ; GCN: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
 ; GCN: s_waitcnt lgkmcnt(1)
 ; GCN: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
-define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
+define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) {
   %idx.add = add nuw i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3
   %shl1 = shl i32 %idx.add, 4
-  %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
-  %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
-  %a1 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
-  store float %a3, float addrspace(1)* %out
+  %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
+  %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3)
+  %a1 = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %ptrf, float %a1, i32 0, i32 0, i1 false)
+  store float %a3, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
index 1eb56f5f35281..93fc998098c48 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
@@ -11,13 +11,13 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=G_GFX9 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_GFX10 %s
 
-declare float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
-declare float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
-declare double @llvm.amdgcn.ds.fmin.f64(double addrspace(3)* nocapture, double, i32, i32, i1)
-declare double @llvm.amdgcn.ds.fmax.f64(double addrspace(3)* nocapture, double, i32, i32, i1)
+declare float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) nocapture, float, i32, i32, i1)
+declare float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) nocapture, float, i32, i32, i1)
+declare double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)
+declare double @llvm.amdgcn.ds.fmax.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)
 
 
-define amdgpu_kernel void @lds_ds_fmin(float addrspace(5)* %out, float addrspace(3)* %ptrf, i32 %idx) {
+define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) {
 ; SI-LABEL: lds_ds_fmin:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
@@ -294,16 +294,16 @@ define amdgpu_kernel void @lds_ds_fmin(float addrspace(5)* %out, float addrspace
   %idx.add = add nuw i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3
   %shl1 = shl i32 %idx.add, 4
-  %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
-  %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
-  %a1 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
-  store float %a3, float addrspace(5)* %out
+  %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
+  %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3)
+  %a1 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptrf, float %a1, i32 0, i32 0, i1 false)
+  store float %a3, ptr addrspace(5) %out
   ret void
 }
 
-define amdgpu_kernel void @lds_ds_fmax(float addrspace(5)* %out, float addrspace(3)* %ptrf, i32 %idx) {
+define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) {
 ; SI-LABEL: lds_ds_fmax:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
@@ -580,16 +580,16 @@ define amdgpu_kernel void @lds_ds_fmax(float addrspace(5)* %out, float addrspace
   %idx.add = add nuw i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3
   %shl1 = shl i32 %idx.add, 4
-  %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
-  %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
-  %a1 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
-  store float %a3, float addrspace(5)* %out
+  %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
+  %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3)
+  %a1 = call float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %ptrf, float %a1, i32 0, i32 0, i1 false)
+  store float %a3, ptr addrspace(5) %out
   ret void
 }
 
-define amdgpu_kernel void @lds_ds_fmin_f64(double addrspace(5)* %out, double addrspace(3)* %ptrf, i32 %idx) {
+define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) {
 ; SI-LABEL: lds_ds_fmin_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -920,16 +920,16 @@ define amdgpu_kernel void @lds_ds_fmin_f64(double addrspace(5)* %out, double add
   %idx.add = add nuw i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3
   %shl1 = shl i32 %idx.add, 4
-  %ptr0 = inttoptr i32 %shl0 to double addrspace(3)*
-  %ptr1 = inttoptr i32 %shl1 to double addrspace(3)*
-  %a1 = call double @llvm.amdgcn.ds.fmin.f64(double addrspace(3)* %ptr0, double 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call double @llvm.amdgcn.ds.fmin.f64(double addrspace(3)* %ptr1, double 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call double @llvm.amdgcn.ds.fmin.f64(double addrspace(3)* %ptrf, double %a1, i32 0, i32 0, i1 false)
-  store double %a3, double addrspace(5)* %out
+  %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
+  %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3)
+  %a1 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr0, double 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr1, double 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptrf, double %a1, i32 0, i32 0, i1 false)
+  store double %a3, ptr addrspace(5) %out
   ret void
 }
 
-define amdgpu_kernel void @lds_ds_fmax_f64(double addrspace(5)* %out, double addrspace(3)* %ptrf, i32 %idx) {
+define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(3) %ptrf, i32 %idx) {
 ; SI-LABEL: lds_ds_fmax_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -1260,11 +1260,11 @@ define amdgpu_kernel void @lds_ds_fmax_f64(double addrspace(5)* %out, double add
   %idx.add = add nuw i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3
   %shl1 = shl i32 %idx.add, 4
-  %ptr0 = inttoptr i32 %shl0 to double addrspace(3)*
-  %ptr1 = inttoptr i32 %shl1 to double addrspace(3)*
-  %a1 = call double @llvm.amdgcn.ds.fmax.f64(double addrspace(3)* %ptr0, double 4.2e+1, i32 0, i32 0, i1 false)
-  %a2 = call double @llvm.amdgcn.ds.fmax.f64(double addrspace(3)* %ptr1, double 4.2e+1, i32 0, i32 0, i1 false)
-  %a3 = call double @llvm.amdgcn.ds.fmax.f64(double addrspace(3)* %ptrf, double %a1, i32 0, i32 0, i1 false)
-  store double %a3, double addrspace(5)* %out
+  %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
+  %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3)
+  %a1 = call double @llvm.amdgcn.ds.fmax.f64(ptr addrspace(3) %ptr0, double 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call double @llvm.amdgcn.ds.fmax.f64(ptr addrspace(3) %ptr1, double 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call double @llvm.amdgcn.ds.fmax.f64(ptr addrspace(3) %ptrf, double %a1, i32 0, i32 0, i1 false)
+  store double %a3, ptr addrspace(5) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-bounds.ll b/llvm/test/CodeGen/AMDGPU/lds-bounds.ll
index 0fecb68ade286..fe68e56b784ef 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-bounds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-bounds.ll
@@ -5,24 +5,24 @@
 
 ; GCN-LABEL: {{^}}store_aligned:
 ; GCN: ds_write_b64
-define amdgpu_cs void @store_aligned(i32 addrspace(3)* %ptr) #0 {
+define amdgpu_cs void @store_aligned(ptr addrspace(3) %ptr) #0 {
 entry:
-  %ptr.gep.1 = getelementptr i32, i32 addrspace(3)* %ptr, i32 1
+  %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1
 
-  store i32 42, i32 addrspace(3)* %ptr, align 8
-  store i32 43, i32 addrspace(3)* %ptr.gep.1
+  store i32 42, ptr addrspace(3) %ptr, align 8
+  store i32 43, ptr addrspace(3) %ptr.gep.1
   ret void
 }
 
 
 ; GCN-LABEL: {{^}}load_aligned:
 ; GCN: ds_read_b64
-define amdgpu_cs <2 x float> @load_aligned(i32 addrspace(3)* %ptr) #0 {
+define amdgpu_cs <2 x float> @load_aligned(ptr addrspace(3) %ptr) #0 {
 entry:
-  %ptr.gep.1 = getelementptr i32, i32 addrspace(3)* %ptr, i32 1
+  %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1
 
-  %v.0 = load i32, i32 addrspace(3)* %ptr, align 8
-  %v.1 = load i32, i32 addrspace(3)* %ptr.gep.1
+  %v.0 = load i32, ptr addrspace(3) %ptr, align 8
+  %v.1 = load i32, ptr addrspace(3) %ptr.gep.1
 
   %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
   %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
@@ -35,11 +35,11 @@ entry:
 ; GCN: ds_write2_b32
 define amdgpu_cs void @store_global_const_idx() #0 {
 entry:
-  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 3
-  %ptr.b = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 4
+  %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3
+  %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4
 
-  store i32 42, i32 addrspace(3)* %ptr.a
-  store i32 43, i32 addrspace(3)* %ptr.b
+  store i32 42, ptr addrspace(3) %ptr.a
+  store i32 43, ptr addrspace(3) %ptr.b
   ret void
 }
 
@@ -48,11 +48,11 @@ entry:
 ; GCN: ds_read2_b32
 define amdgpu_cs <2 x float> @load_global_const_idx() #0 {
 entry:
-  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 3
-  %ptr.b = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 4
+  %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3
+  %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4
 
-  %v.0 = load i32, i32 addrspace(3)* %ptr.a
-  %v.1 = load i32, i32 addrspace(3)* %ptr.b
+  %v.0 = load i32, ptr addrspace(3) %ptr.a
+  %v.1 = load i32, ptr addrspace(3) %ptr.b
 
   %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
   %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
@@ -67,11 +67,11 @@ entry:
 ; NOSI: ds_write2_b32
 define amdgpu_cs void @store_global_var_idx_case1(i32 %idx) #0 {
 entry:
-  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx
-  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+  %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 %idx
+  %ptr.b = getelementptr i32, ptr addrspace(3) %ptr.a, i32 1
 
-  store i32 42, i32 addrspace(3)* %ptr.a
-  store i32 43, i32 addrspace(3)* %ptr.b
+  store i32 42, ptr addrspace(3) %ptr.a
+  store i32 43, ptr addrspace(3) %ptr.b
   ret void
 }
 
@@ -82,11 +82,11 @@ entry:
 ; NOSI: ds_read2_b32
 define amdgpu_cs <2 x float> @load_global_var_idx_case1(i32 %idx) #0 {
 entry:
-  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx
-  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+  %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 %idx
+  %ptr.b = getelementptr i32, ptr addrspace(3) %ptr.a, i32 1
 
-  %v.0 = load i32, i32 addrspace(3)* %ptr.a
-  %v.1 = load i32, i32 addrspace(3)* %ptr.b
+  %v.0 = load i32, ptr addrspace(3) %ptr.a
+  %v.1 = load i32, ptr addrspace(3) %ptr.b
 
   %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
   %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
@@ -100,11 +100,11 @@ entry:
 define amdgpu_cs void @store_global_var_idx_case2(i32 %idx) #0 {
 entry:
   %idx.and = and i32 %idx, 255
-  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx.and
-  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+  %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 %idx.and
+  %ptr.b = getelementptr i32, ptr addrspace(3) %ptr.a, i32 1
 
-  store i32 42, i32 addrspace(3)* %ptr.a
-  store i32 43, i32 addrspace(3)* %ptr.b
+  store i32 42, ptr addrspace(3) %ptr.a
+  store i32 43, ptr addrspace(3) %ptr.b
   ret void
 }
 
@@ -114,11 +114,11 @@ entry:
 define amdgpu_cs <2 x float> @load_global_var_idx_case2(i32 %idx) #0 {
 entry:
   %idx.and = and i32 %idx, 255
-  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx.and
-  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+  %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 %idx.and
+  %ptr.b = getelementptr i32, ptr addrspace(3) %ptr.a, i32 1
 
-  %v.0 = load i32, i32 addrspace(3)* %ptr.a
-  %v.1 = load i32, i32 addrspace(3)* %ptr.b
+  %v.0 = load i32, ptr addrspace(3) %ptr.a
+  %v.1 = load i32, ptr addrspace(3) %ptr.b
 
   %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
   %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 66e2bfaeeb444..d8e3f52331ab4 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -22,7 +22,7 @@ define void @use_module() #0 {
 ; CHECK-NEXT:    ds_write_b16 v0, v0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  store i16 0, i16 addrspace(3)* @module_variable
+  store i16 0, ptr addrspace(3) @module_variable
   ret void
 }
 
@@ -49,10 +49,10 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) #1 {
 ; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    ds_write_b32 v2, v0
 ; CHECK-NEXT:    s_endpgm
-  store i16 2, i16 addrspace(3)* @kernel_normal
+  store i16 2, ptr addrspace(3) @kernel_normal
 
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
-  store float 0.0, float addrspace(3)* %arrayidx1
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 %idx
+  store float 0.0, ptr addrspace(3) %arrayidx1
   ret void
 }
 
@@ -85,12 +85,12 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
 ; CHECK-NEXT:    ds_write_b32 v3, v0
 ; CHECK-NEXT:    s_endpgm
   call void @use_module()
-  store i16 1, i16 addrspace(3)* @module_variable
+  store i16 1, ptr addrspace(3) @module_variable
 
-  store i16 2, i16 addrspace(3)* @kernel_normal
+  store i16 2, ptr addrspace(3) @kernel_normal
 
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
-  store float 0.0, float addrspace(3)* %arrayidx1
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 %idx
+  store float 0.0, ptr addrspace(3) %arrayidx1
   ret void
 }
 
@@ -107,10 +107,10 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) #1
 ; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    ds_write_b32 v2, v0
 ; CHECK-NEXT:    s_endpgm
-  store i16 2, i16 addrspace(3)* @kernel_overalign
+  store i16 2, ptr addrspace(3) @kernel_overalign
 
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
-  store float 0.0, float addrspace(3)* %arrayidx1
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 %idx
+  store float 0.0, ptr addrspace(3) %arrayidx1
   ret void
 }
 
@@ -143,12 +143,12 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
 ; CHECK-NEXT:    ds_write_b32 v3, v0
 ; CHECK-NEXT:    s_endpgm
   call void @use_module()
-  store i16 1, i16 addrspace(3)* @module_variable
+  store i16 1, ptr addrspace(3) @module_variable
 
-  store i16 2, i16 addrspace(3)* @kernel_overalign
+  store i16 2, ptr addrspace(3) @kernel_overalign
 
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
-  store float 0.0, float addrspace(3)* %arrayidx1
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 %idx
+  store float 0.0, ptr addrspace(3) %arrayidx1
   ret void
 }
 
@@ -165,10 +165,10 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) #1
 ; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    ds_write_b32 v2, v0
 ; CHECK-NEXT:    s_endpgm
-  store i16 2, i16 addrspace(3)* @kernel_normal
+  store i16 2, ptr addrspace(3) @kernel_normal
 
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
-  store float 0.0, float addrspace(3)* %arrayidx1
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 %idx
+  store float 0.0, ptr addrspace(3) %arrayidx1
   ret void
 }
 
@@ -201,12 +201,12 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
 ; CHECK-NEXT:    ds_write_b32 v3, v0
 ; CHECK-NEXT:    s_endpgm
   call void @use_module()
-  store i16 1, i16 addrspace(3)* @module_variable
+  store i16 1, ptr addrspace(3) @module_variable
 
-  store i16 2, i16 addrspace(3)* @kernel_normal
+  store i16 2, ptr addrspace(3) @kernel_normal
 
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
-  store float 0.0, float addrspace(3)* %arrayidx1
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 %idx
+  store float 0.0, ptr addrspace(3) %arrayidx1
   ret void
 }
 
@@ -223,10 +223,10 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx)
 ; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    ds_write_b32 v2, v0
 ; CHECK-NEXT:    s_endpgm
-  store i16 2, i16 addrspace(3)* @kernel_overalign
+  store i16 2, ptr addrspace(3) @kernel_overalign
 
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
-  store float 0.0, float addrspace(3)* %arrayidx1
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 %idx
+  store float 0.0, ptr addrspace(3) %arrayidx1
   ret void
 }
 
@@ -259,12 +259,12 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
 ; CHECK-NEXT:    ds_write_b32 v3, v0
 ; CHECK-NEXT:    s_endpgm
   call void @use_module()
-  store i16 1, i16 addrspace(3)* @module_variable
+  store i16 1, ptr addrspace(3) @module_variable
 
-  store i16 2, i16 addrspace(3)* @kernel_overalign
+  store i16 2, ptr addrspace(3) @kernel_overalign
 
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
-  store float 0.0, float addrspace(3)* %arrayidx1
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 %idx
+  store float 0.0, ptr addrspace(3) %arrayidx1
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 540422ba45ab9..2e3aefa59ab7d 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -28,7 +28,7 @@ define void @func_use_lds_global() {
 ; GFX9-NEXT:    s_trap 2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  store float 0.0, float addrspace(3)* @lds, align 4
+  store float 0.0, ptr addrspace(3) @lds, align 4
   ret void
 }
 
@@ -46,6 +46,6 @@ define void @func_use_lds_global_constexpr_cast() {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_trap 2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  store i32 ptrtoint (float addrspace(3)* @lds to i32), i32 addrspace(1)* undef, align 4
+  store i32 ptrtoint (ptr addrspace(3) @lds to i32), ptr addrspace(1) undef, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-initializer.ll
index 8b46b4c3e6b28..b26bf02165f7c 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-initializer.ll
@@ -5,9 +5,9 @@
 
 @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
 
-define amdgpu_kernel void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
- %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10
-  %ld = load i32, i32 addrspace(3)* %gep
-  store i32 %ld, i32 addrspace(1)* %out
+define amdgpu_kernel void @load_init_lds_global(ptr addrspace(1) %out, i1 %p) {
+ %gep = getelementptr [8 x i32], ptr addrspace(3) @lds, i32 0, i32 10
+  %ld = load i32, ptr addrspace(3) %gep
+  store i32 %ld, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll b/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
index 620d4ae9f9c49..c7ca67a426235 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -18,7 +18,7 @@
 
 ; GCN: .LBB0_3:
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 {
+define amdgpu_kernel void @copy_local_to_global_loop_m0_init(ptr addrspace(1) noalias nocapture %out, ptr addrspace(3) noalias nocapture readonly %in, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -35,10 +35,10 @@ bb:
 .lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
   %i.01 = phi i32 [ %tmp4, %.lr.ph ], [ 0, %.lr.ph.preheader ]
-  %tmp1 = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %i.01
-  %tmp2 = load i32, i32 addrspace(3)* %tmp1, align 4
-  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %indvars.iv
-  store i32 %tmp2, i32 addrspace(1)* %tmp3, align 4
+  %tmp1 = getelementptr inbounds i32, ptr addrspace(3) %in, i32 %i.01
+  %tmp2 = load i32, ptr addrspace(3) %tmp1, align 4
+  %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %indvars.iv
+  store i32 %tmp2, ptr addrspace(1) %tmp3, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %tmp4 = add nuw nsw i32 %i.01, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
index b5fceed2c105b..195d49240d2d6 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
@@ -10,17 +10,16 @@
 ; GCN-LABEL: test_local_misaligned_v2:
 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
-define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
+define amdgpu_kernel void @test_local_misaligned_v2(ptr addrspace(3) %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
-  %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
-  %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
+  %load = load <2 x i32>, ptr addrspace(3) %gep, align 4
   %v1 = extractelement <2 x i32> %load, i32 0
   %v2 = extractelement <2 x i32> %load, i32 1
   %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
   %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
-  store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 4
+  store <2 x i32> %v4, ptr addrspace(3) %gep, align 4
   ret void
 }
 
@@ -29,12 +28,11 @@ bb:
 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
-define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
+define amdgpu_kernel void @test_local_misaligned_v4(ptr addrspace(3) %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
-  %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
-  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
+  %load = load <4 x i32>, ptr addrspace(3) %gep, align 4
   %v1 = extractelement <4 x i32> %load, i32 0
   %v2 = extractelement <4 x i32> %load, i32 1
   %v3 = extractelement <4 x i32> %load, i32 2
@@ -43,7 +41,7 @@ bb:
   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
-  store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4
+  store <4 x i32> %v8, ptr addrspace(3) %gep, align 4
   ret void
 }
 
@@ -52,19 +50,18 @@ bb:
 ; GCN-DAG: ds_{{read|load}}_b32
 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
 ; GCN-DAG: ds_{{write|store}}_b32
-define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
+define amdgpu_kernel void @test_local_misaligned_v3(ptr addrspace(3) %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
-  %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
-  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
+  %load = load <3 x i32>, ptr addrspace(3) %gep, align 4
   %v1 = extractelement <3 x i32> %load, i32 0
   %v2 = extractelement <3 x i32> %load, i32 1
   %v3 = extractelement <3 x i32> %load, i32 2
   %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
   %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
   %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
-  store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4
+  store <3 x i32> %v7, ptr addrspace(3) %gep, align 4
   ret void
 }
 
@@ -75,17 +72,16 @@ bb:
 ; SPLIT-DAG: flat_load_{{dword|b32}} v
 ; SPLIT-DAG: flat_store_{{dword|b32}} v
 ; SPLIT-DAG: flat_store_{{dword|b32}} v
-define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) {
+define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
-  %ptr = bitcast i32* %gep to <2 x i32>*
-  %load = load <2 x i32>, <2 x i32>* %ptr, align 4
+  %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
+  %load = load <2 x i32>, ptr %gep, align 4
   %v1 = extractelement <2 x i32> %load, i32 0
   %v2 = extractelement <2 x i32> %load, i32 1
   %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
   %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
-  store <2 x i32> %v4, <2 x i32>* %ptr, align 4
+  store <2 x i32> %v4, ptr %gep, align 4
   ret void
 }
 
@@ -100,12 +96,11 @@ bb:
 ; SPLIT-DAG: flat_store_{{dword|b32}} v
 ; SPLIT-DAG: flat_store_{{dword|b32}} v
 ; SPLIT-DAG: flat_store_{{dword|b32}} v
-define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) {
+define amdgpu_kernel void @test_flat_misaligned_v4(ptr %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
-  %ptr = bitcast i32* %gep to <4 x i32>*
-  %load = load <4 x i32>, <4 x i32>* %ptr, align 4
+  %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
+  %load = load <4 x i32>, ptr %gep, align 4
   %v1 = extractelement <4 x i32> %load, i32 0
   %v2 = extractelement <4 x i32> %load, i32 1
   %v3 = extractelement <4 x i32> %load, i32 2
@@ -114,7 +109,7 @@ bb:
   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
-  store <4 x i32> %v8, <4 x i32>* %ptr, align 4
+  store <4 x i32> %v8, ptr %gep, align 4
   ret void
 }
 
@@ -127,84 +122,79 @@ bb:
 ; SPLIT-DAG: flat_store_{{dword|b32}} v
 ; SPLIT-DAG: flat_store_{{dword|b32}} v
 ; SPLIT-DAG: flat_store_{{dword|b32}} v
-define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) {
+define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
-  %ptr = bitcast i32* %gep to <3 x i32>*
-  %load = load <3 x i32>, <3 x i32>* %ptr, align 4
+  %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
+  %load = load <3 x i32>, ptr %gep, align 4
   %v1 = extractelement <3 x i32> %load, i32 0
   %v2 = extractelement <3 x i32> %load, i32 1
   %v3 = extractelement <3 x i32> %load, i32 2
   %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
   %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
   %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
-  store <3 x i32> %v7, <3 x i32>* %ptr, align 4
+  store <3 x i32> %v7, ptr %gep, align 4
   ret void
 }
 
 ; GCN-LABEL: test_local_aligned_v2:
 ; GCN-DAG: ds_{{read|load}}_b64
 ; GCN-DAG: ds_{{write|store}}_b64
-define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
+define amdgpu_kernel void @test_local_aligned_v2(ptr addrspace(3) %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
-  %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
-  %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8
+  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
+  %load = load <2 x i32>, ptr addrspace(3) %gep, align 8
   %v1 = extractelement <2 x i32> %load, i32 0
   %v2 = extractelement <2 x i32> %load, i32 1
   %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
   %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
-  store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8
+  store <2 x i32> %v4, ptr addrspace(3) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: test_local_aligned_v3:
 ; GCN-DAG: ds_{{read|load}}_b96
 ; GCN-DAG: ds_{{write|store}}_b96
-define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
+define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
-  %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
-  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
+  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
+  %load = load <3 x i32>, ptr addrspace(3) %gep, align 16
   %v1 = extractelement <3 x i32> %load, i32 0
   %v2 = extractelement <3 x i32> %load, i32 1
   %v3 = extractelement <3 x i32> %load, i32 2
   %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
   %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
   %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
-  store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16
+  store <3 x i32> %v7, ptr addrspace(3) %gep, align 16
   ret void
 }
 
 ; GCN-LABEL: test_flat_aligned_v2:
 ; GCN-DAG: flat_load_{{dwordx2|b64}} v
 ; GCN-DAG: flat_store_{{dwordx2|b64}} v
-define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) {
+define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
-  %ptr = bitcast i32* %gep to <2 x i32>*
-  %load = load <2 x i32>, <2 x i32>* %ptr, align 8
+  %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
+  %load = load <2 x i32>, ptr %gep, align 8
   %v1 = extractelement <2 x i32> %load, i32 0
   %v2 = extractelement <2 x i32> %load, i32 1
   %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
   %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
-  store <2 x i32> %v4, <2 x i32>* %ptr, align 8
+  store <2 x i32> %v4, ptr %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: test_flat_aligned_v4:
 ; GCN-DAG: flat_load_{{dwordx4|b128}} v
 ; GCN-DAG: flat_store_{{dwordx4|b128}} v
-define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) {
+define amdgpu_kernel void @test_flat_aligned_v4(ptr %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
-  %ptr = bitcast i32* %gep to <4 x i32>*
-  %load = load <4 x i32>, <4 x i32>* %ptr, align 16
+  %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
+  %load = load <4 x i32>, ptr %gep, align 16
   %v1 = extractelement <4 x i32> %load, i32 0
   %v2 = extractelement <4 x i32> %load, i32 1
   %v3 = extractelement <4 x i32> %load, i32 2
@@ -213,7 +203,7 @@ bb:
   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
-  store <4 x i32> %v8, <4 x i32>* %ptr, align 16
+  store <4 x i32> %v8, ptr %gep, align 16
   ret void
 }
 
@@ -222,12 +212,11 @@ bb:
 ; ALIGNED-DAG: ds_{{write2|store_2addr}}_b64
 ; UNALIGNED-DAG: ds_{{read2|load_2addr}}_b64
 ; UNALIGNED-DAG: ds_{{write2|store_2addr}}_b64
-define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
+define amdgpu_kernel void @test_local_v4_aligned8(ptr addrspace(3) %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
-  %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
-  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
+  %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
+  %load = load <4 x i32>, ptr addrspace(3) %gep, align 8
   %v1 = extractelement <4 x i32> %load, i32 0
   %v2 = extractelement <4 x i32> %load, i32 1
   %v3 = extractelement <4 x i32> %load, i32 2
@@ -236,7 +225,7 @@ bb:
   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
-  store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8
+  store <4 x i32> %v8, ptr addrspace(3) %gep, align 8
   ret void
 }
 
@@ -247,12 +236,11 @@ bb:
 ; SPLIT-DAG: flat_load_{{dwordx2|b64}} v
 ; SPLIT-DAG: flat_store_{{dwordx2|b64}} v
 ; SPLIT-DAG: flat_store_{{dwordx2|b64}} v
-define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) {
+define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
-  %ptr = bitcast i32* %gep to <4 x i32>*
-  %load = load <4 x i32>, <4 x i32>* %ptr, align 8
+  %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
+  %load = load <4 x i32>, ptr %gep, align 8
   %v1 = extractelement <4 x i32> %load, i32 0
   %v2 = extractelement <4 x i32> %load, i32 1
   %v3 = extractelement <4 x i32> %load, i32 2
@@ -261,7 +249,7 @@ bb:
   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
-  store <4 x i32> %v8, <4 x i32>* %ptr, align 8
+  store <4 x i32> %v8, ptr %gep, align 8
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll b/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll
index fff2a92007293..8a48c2989d1ea 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll
@@ -10,9 +10,9 @@
 ; reads and writes are bundled together in the same instruction.
 
 ; CHECK: {{^}}lds_crash:
-define amdgpu_kernel void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @lds_crash(ptr addrspace(1) %out, ptr addrspace(3) %in, i32 %a, i32 %b, i32 %c) {
 entry:
-  %0 = load i32, i32 addrspace(3)* %in
+  %0 = load i32, ptr addrspace(3) %in
   ; This block needs to be > 115 ISA instructions to hit the bug,
   ; so we'll use udiv instructions.
   %div0 = udiv i32 %0, %b
@@ -23,6 +23,6 @@ entry:
   %div5 = udiv i32 %div4, %c
   %div6 = udiv i32 %div5, %div0
   %div7 = udiv i32 %div6, %div1
-  store i32 %div7, i32 addrspace(1)* %out
+  store i32 %div7, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
index f8fb12eefa620..6b35574f0b756 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
@@ -10,16 +10,16 @@
 
 @local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4
 
-define amdgpu_kernel void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
+define amdgpu_kernel void @lds_input_queue(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %index) {
 entry:
-  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
-  %1 = load i32, i32 addrspace(3)* %0
+  %0 = getelementptr inbounds [2 x i32], ptr addrspace(3) @local_mem, i32 0, i32 %index
+  %1 = load i32, ptr addrspace(3) %0
   call void @llvm.r600.group.barrier()
 
   ; This will start a new clause for the vertex fetch
-  %2 = load i32, i32 addrspace(1)* %in
+  %2 = load i32, ptr addrspace(1) %in
   %3 = add i32 %1, %2
-  store i32 %3, i32 addrspace(1)* %out
+  store i32 %3, ptr addrspace(1) %out
   ret void
 }
 
@@ -40,9 +40,9 @@ declare void @llvm.r600.group.barrier() nounwind convergent
 ; load from global memory which immediately follows a load from a global value that
 ; has been declared in the local memory space:
 ;
-;  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
-;  %1 = load i32, i32 addrspace(3)* %0
-;  %2 = load i32, i32 addrspace(1)* %in
+;  %0 = getelementptr inbounds [2 x i32], ptr addrspace(3) @local_mem, i32 0, i32 %index
+;  %1 = load i32, ptr addrspace(3) %0
+;  %2 = load i32, ptr addrspace(1) %in
 ;
 ; The instruction selection phase will generate ISA that looks like this:
 ; %oqap = LDS_READ_RET
@@ -88,12 +88,11 @@ declare void @llvm.r600.group.barrier() nounwind convergent
 ; CHECK: LDS_READ_RET
 ; CHECK-NOT: ALU clause
 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
-define amdgpu_kernel void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @local_global_alias(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
-  %1 = load i32, i32 addrspace(3)* %0
-  %2 = load i32, i32 addrspace(1)* %in
-  %3 = add i32 %2, %1
-  store i32 %3, i32 addrspace(1)* %out
+  %0 = load i32, ptr addrspace(3) @local_mem
+  %1 = load i32, ptr addrspace(1) %in
+  %2 = add i32 %1, %0
+  store i32 %2, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/lds-reject-anonymous-kernels.ll b/llvm/test/CodeGen/AMDGPU/lds-reject-anonymous-kernels.ll
index 2c45c08dbaaa7..8ab5b9d70d237 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-reject-anonymous-kernels.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-reject-anonymous-kernels.ll
@@ -5,8 +5,8 @@
 
 ; CHECK: LLVM ERROR: Anonymous kernels cannot use LDS variables
 define amdgpu_kernel void @0() {
-  %val0 = load i32, i32 addrspace(3)* @var1
+  %val0 = load i32, ptr addrspace(3) @var1
   %val1 = add i32 %val0, 4
-  store i32 %val1, i32 addrspace(3)* @var1
+  store i32 %val1, ptr addrspace(3) @var1
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
index ae062bd06a0f3..4bd3a2f964e72 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
@@ -44,11 +44,11 @@
 ; GCN: .amdgpu_lds lds.defined, 32, 8
 define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 {
 main_body:
-  %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1
-  %tmp = load i32, i32 addrspace(3)* %gep0
+  %gep0 = getelementptr [0 x i32], ptr addrspace(3) @lds.external, i32 0, i32 %arg1
+  %tmp = load i32, ptr addrspace(3) %gep0
 
-  %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave
-  store i32 123, i32 addrspace(3)* %gep1
+  %gep1 = getelementptr [8 x i32], ptr addrspace(3) @lds.defined, i32 0, i32 %wave
+  store i32 123, ptr addrspace(3) %gep1
 
   %r = bitcast i32 %tmp to float
   ret float %r

diff --git a/llvm/test/CodeGen/AMDGPU/lds-size.ll b/llvm/test/CodeGen/AMDGPU/lds-size.ll
index 313e4d0e07426..ab104c4abfbca 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-size.ll
@@ -18,17 +18,17 @@
 ; GCN: ; LDSByteSize: 4 bytes/workgroup (compile time only)
 @lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
 
-define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %cond) {
 entry:
   %0 = icmp eq i32 %cond, 0
   br i1 %0, label %if, label %else
 
 if:
-  store i32 1, i32 addrspace(3)* @lds
+  store i32 1, ptr addrspace(3) @lds
   br label %endif
 
 else:
-  store i32 2, i32 addrspace(3)* @lds
+  store i32 2, ptr addrspace(3) @lds
   br label %endif
 
 endif:

diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
index d0c75fbf2383e..b497979fde71f 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -8,7 +8,7 @@
 
 @lds = addrspace(3) global [256 x i32] zeroinitializer
 
-define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
+define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p) {
   ; GCN-LABEL: name: load_zeroinit_lds_global
   ; GCN: bb.0 (%ir-block.0):
   ; GCN:   liveins: $sgpr0_sgpr1
@@ -27,8 +27,8 @@ define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %
   ; GFX8:  BUFFER_STORE_DWORD_OFFSET killed [[DS_READ_B32_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec
   ; GFX9:  FLAT_STORE_DWORD killed [[COPY1]], killed [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr
   ; GCN:   S_ENDPGM 0
- %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10
-  %ld = load i32, i32 addrspace(3)* %gep
-  store i32 %ld, i32 addrspace(1)* %out
+ %gep = getelementptr [256 x i32], ptr addrspace(3) @lds, i32 0, i32 10
+  %ld = load i32, ptr addrspace(3) %gep
+  store i32 %ld, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll b/llvm/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll
index 0c43c0d4de60b..e0de5430a23e6 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll
@@ -4,8 +4,8 @@
 ; and dereferenceable flags.
 
 ; GCN: BUFFER_LOAD_USHORT{{.*}} :: (dereferenceable invariant load (s16) from %ir.ptr, addrspace 4)
-define half @legalize_f16_load(half addrspace(4)* dereferenceable(4) %ptr) {
-  %load = load half, half addrspace(4)* %ptr, !invariant.load !0
+define half @legalize_f16_load(ptr addrspace(4) dereferenceable(4) %ptr) {
+  %load = load half, ptr addrspace(4) %ptr, !invariant.load !0
   %add = fadd half %load, 1.0
   ret half %add
 }

diff --git a/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll b/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
index e85a1b690af60..68f2333f945aa 100644
--- a/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
@@ -11,16 +11,16 @@
 ; CHECK: {{^}}setcc_expand:
 ; CHECK: SET
 ; CHECK-NOT: CND
-define amdgpu_kernel void @setcc_expand(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @setcc_expand(ptr addrspace(1) %out, i32 %in) {
 entry:
   %0 = icmp eq i32 %in, 5
   br i1 %0, label %IF, label %ENDIF
 IF:
-  %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  store i32 0, i32 addrspace(1)* %1
+  %1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  store i32 0, ptr addrspace(1) %1
   br label %ENDIF
 
 ENDIF:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll b/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll
index 6e3641948ac39..4417a280e5101 100644
--- a/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/literal-constant-like-operand-instruction-size.ll
@@ -21,19 +21,19 @@ declare void @llvm.amdgcn.s.sleep(i32) #0
 ; GCN-NEXT: s_getpc_b64 s[[[PC_LO]]:[[PC_HI]]]
 
 ; GCN: [[BB3]]: ; %bb3
-define amdgpu_kernel void @branch_offset_test(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @branch_offset_test(ptr addrspace(1) %arg, i32 %cnd) #0 {
 bb:
   %cmp = icmp eq i32 %cnd, 0
   br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
 
 bb2:
-  store i32 1, i32 addrspace(1)* @name1
-  store i32 2, i32 addrspace(1)* @name2
-  store i32 3, i32 addrspace(1)* @name3
+  store i32 1, ptr addrspace(1) @name1
+  store i32 2, ptr addrspace(1) @name2
+  store i32 3, ptr addrspace(1) @name3
   call void @llvm.amdgcn.s.sleep(i32 0)
   br label %bb3
 
 bb3:
-  store volatile i32 %cnd, i32 addrspace(1)* %arg
+  store volatile i32 %cnd, ptr addrspace(1) %arg
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/literals.ll b/llvm/test/CodeGen/AMDGPU/literals.ll
index 1c546ba9f74ba..41fb1fd53fd8e 100644
--- a/llvm/test/CodeGen/AMDGPU/literals.ll
+++ b/llvm/test/CodeGen/AMDGPU/literals.ll
@@ -10,10 +10,10 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
 ; CHECK-NEXT: 5
-define amdgpu_kernel void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @i32_literal(ptr addrspace(1) %out, i32 %in) {
 entry:
   %0 = add i32 5, %in
-  store i32 %0, i32 addrspace(1)* %out
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -27,10 +27,10 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
 ; CHECK-NEXT: 1084227584(5.0
-define amdgpu_kernel void @float_literal(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @float_literal(ptr addrspace(1) %out, float %in) {
 entry:
   %0 = fadd float 5.0, %in
-  store float %0, float addrspace(1)* %out
+  store float %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -41,9 +41,9 @@ entry:
 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0
 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0
 
-define amdgpu_kernel void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @inline_literal_reg_sequence(ptr addrspace(1) %out) {
 entry:
-  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out
+  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, ptr addrspace(1) %out
   ret void
 }
 
@@ -52,10 +52,10 @@ entry:
 ; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0
 ; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0
 ; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0
-define amdgpu_kernel void @inline_literal_dot4(float addrspace(1)* %out) {
+define amdgpu_kernel void @inline_literal_dot4(ptr addrspace(1) %out) {
 entry:
   %0 = call float @llvm.r600.dot4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
-  store float %0, float addrspace(1)* %out
+  store float %0, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 54c351f8390c9..712a15ebd2305 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -9,7 +9,7 @@
 ; RUN:  llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11SELDAG,GFX11CHECK %s
 ; RUN:  llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11GLISEL,GFX11CHECK %s
 
-define amdgpu_kernel void @sgpr_isnan_f16(i32 addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
 ; GFX7SELDAG-LABEL: sgpr_isnan_f16:
 ; GFX7SELDAG:       ; %bb.0:
 ; GFX7SELDAG-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -90,7 +90,7 @@ define amdgpu_kernel void @sgpr_isnan_f16(i32 addrspace(1)* %out, half %x) {
 ; GFX11CHECK-NEXT:    s_endpgm
   %result = call i1 @llvm.is.fpclass.f16(half %x, i32 3)
   %sext = sext i1 %result to i32
-  store i32 %sext, i32 addrspace(1)* %out, align 4
+  store i32 %sext, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index ecf6e9c1b3824..c746dab2010f5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -9,7 +9,7 @@
 ; RUN:  llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11CHECK %s
 ; RUN:  llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11CHECK %s
 
-define amdgpu_kernel void @sgpr_isnan_f32(i32 addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) {
 ; GFX7SELDAG-LABEL: sgpr_isnan_f32:
 ; GFX7SELDAG:       ; %bb.0:
 ; GFX7SELDAG-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -84,11 +84,11 @@ define amdgpu_kernel void @sgpr_isnan_f32(i32 addrspace(1)* %out, float %x) {
 ; GFX11CHECK-NEXT:    s_endpgm
   %result = call i1 @llvm.is.fpclass.f32(float %x, i32 3)  ; nan
   %sext = sext i1 %result to i32
-  store i32 %sext, i32 addrspace(1)* %out, align 4
+  store i32 %sext, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @sgpr_isnan_f64(i32 addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) {
 ; GFX7ISELDAG-LABEL: sgpr_isnan_f64:
 ; GFX7ISELDAG:       ; %bb.0:
 ; GFX7ISELDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -168,7 +168,7 @@ define amdgpu_kernel void @sgpr_isnan_f64(i32 addrspace(1)* %out, double %x) {
 ; GFX11CHECK-NEXT:    s_endpgm
   %result = call i1 @llvm.is.fpclass.f64(double %x, i32 3)  ; nan
   %sext = sext i1 %result to i32
-  store i32 %sext, i32 addrspace(1)* %out, align 4
+  store i32 %sext, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
index f243598fb219a..94f6772f791c0 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
@@ -6,32 +6,32 @@
 ; GCN: s_load_dwordx8
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v8f32(float addrspace(4)* noalias nocapture readonly %weights, float addrspace(1)* noalias nocapture %out_ptr) {
+define amdgpu_kernel void @constant_load_v8f32(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) {
 entry:
-  %out_ptr.promoted = load float, float addrspace(1)* %out_ptr, align 4
-  %tmp = load float, float addrspace(4)* %weights, align 4
+  %out_ptr.promoted = load float, ptr addrspace(1) %out_ptr, align 4
+  %tmp = load float, ptr addrspace(4) %weights, align 4
   %add = fadd float %tmp, %out_ptr.promoted
-  %arrayidx.1 = getelementptr inbounds float, float addrspace(4)* %weights, i64 1
-  %tmp1 = load float, float addrspace(4)* %arrayidx.1, align 4
+  %arrayidx.1 = getelementptr inbounds float, ptr addrspace(4) %weights, i64 1
+  %tmp1 = load float, ptr addrspace(4) %arrayidx.1, align 4
   %add.1 = fadd float %tmp1, %add
-  %arrayidx.2 = getelementptr inbounds float, float addrspace(4)* %weights, i64 2
-  %tmp2 = load float, float addrspace(4)* %arrayidx.2, align 4
+  %arrayidx.2 = getelementptr inbounds float, ptr addrspace(4) %weights, i64 2
+  %tmp2 = load float, ptr addrspace(4) %arrayidx.2, align 4
   %add.2 = fadd float %tmp2, %add.1
-  %arrayidx.3 = getelementptr inbounds float, float addrspace(4)* %weights, i64 3
-  %tmp3 = load float, float addrspace(4)* %arrayidx.3, align 4
+  %arrayidx.3 = getelementptr inbounds float, ptr addrspace(4) %weights, i64 3
+  %tmp3 = load float, ptr addrspace(4) %arrayidx.3, align 4
   %add.3 = fadd float %tmp3, %add.2
-  %arrayidx.4 = getelementptr inbounds float, float addrspace(4)* %weights, i64 4
-  %tmp4 = load float, float addrspace(4)* %arrayidx.4, align 4
+  %arrayidx.4 = getelementptr inbounds float, ptr addrspace(4) %weights, i64 4
+  %tmp4 = load float, ptr addrspace(4) %arrayidx.4, align 4
   %add.4 = fadd float %tmp4, %add.3
-  %arrayidx.5 = getelementptr inbounds float, float addrspace(4)* %weights, i64 5
-  %tmp5 = load float, float addrspace(4)* %arrayidx.5, align 4
+  %arrayidx.5 = getelementptr inbounds float, ptr addrspace(4) %weights, i64 5
+  %tmp5 = load float, ptr addrspace(4) %arrayidx.5, align 4
   %add.5 = fadd float %tmp5, %add.4
-  %arrayidx.6 = getelementptr inbounds float, float addrspace(4)* %weights, i64 6
-  %tmp6 = load float, float addrspace(4)* %arrayidx.6, align 4
+  %arrayidx.6 = getelementptr inbounds float, ptr addrspace(4) %weights, i64 6
+  %tmp6 = load float, ptr addrspace(4) %arrayidx.6, align 4
   %add.6 = fadd float %tmp6, %add.5
-  %arrayidx.7 = getelementptr inbounds float, float addrspace(4)* %weights, i64 7
-  %tmp7 = load float, float addrspace(4)* %arrayidx.7, align 4
+  %arrayidx.7 = getelementptr inbounds float, ptr addrspace(4) %weights, i64 7
+  %tmp7 = load float, ptr addrspace(4) %arrayidx.7, align 4
   %add.7 = fadd float %tmp7, %add.6
-  store float %add.7, float addrspace(1)* %out_ptr, align 4
+  store float %add.7, ptr addrspace(1) %out_ptr, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index dcd420870c2f3..6984650ebbf0f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -6,9 +6,9 @@
 ; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
 ; GCN-NOHSA: buffer_store_dwordx2
 ; GCN-HSA: flat_store_dwordx2
-define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(4)* %in) #0 {
-  %ld = load double, double addrspace(4)* %in
-  store double %ld, double addrspace(1)* %out
+define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %ld = load double, ptr addrspace(4) %in
+  store double %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -17,32 +17,32 @@ attributes #0 = { nounwind }
 ; Tests whether a load-chain of 8 constants of 64bit each gets vectorized into a wider load.
 ; FUNC-LABEL: {{^}}constant_load_2v4f64:
 ; GCN: s_load_dwordx16
-define amdgpu_kernel void @constant_load_2v4f64(double addrspace(4)* noalias nocapture readonly %weights, double addrspace(1)* noalias nocapture %out_ptr) {
+define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) {
 entry:
-  %out_ptr.promoted = load double, double addrspace(1)* %out_ptr, align 4
-  %tmp = load double, double addrspace(4)* %weights, align 4
+  %out_ptr.promoted = load double, ptr addrspace(1) %out_ptr, align 4
+  %tmp = load double, ptr addrspace(4) %weights, align 4
   %add = fadd double %tmp, %out_ptr.promoted
-  %arrayidx.1 = getelementptr inbounds double, double addrspace(4)* %weights, i64 1
-  %tmp1 = load double, double addrspace(4)* %arrayidx.1, align 4
+  %arrayidx.1 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 1
+  %tmp1 = load double, ptr addrspace(4) %arrayidx.1, align 4
   %add.1 = fadd double %tmp1, %add
-  %arrayidx.2 = getelementptr inbounds double, double addrspace(4)* %weights, i64 2
-  %tmp2 = load double, double addrspace(4)* %arrayidx.2, align 4
+  %arrayidx.2 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 2
+  %tmp2 = load double, ptr addrspace(4) %arrayidx.2, align 4
   %add.2 = fadd double %tmp2, %add.1
-  %arrayidx.3 = getelementptr inbounds double, double addrspace(4)* %weights, i64 3
-  %tmp3 = load double, double addrspace(4)* %arrayidx.3, align 4
+  %arrayidx.3 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 3
+  %tmp3 = load double, ptr addrspace(4) %arrayidx.3, align 4
   %add.3 = fadd double %tmp3, %add.2
-  %arrayidx.4 = getelementptr inbounds double, double addrspace(4)* %weights, i64 4
-  %tmp4 = load double, double addrspace(4)* %arrayidx.4, align 4
+  %arrayidx.4 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 4
+  %tmp4 = load double, ptr addrspace(4) %arrayidx.4, align 4
   %add.4 = fadd double %tmp4, %add.3
-  %arrayidx.5 = getelementptr inbounds double, double addrspace(4)* %weights, i64 5
-  %tmp5 = load double, double addrspace(4)* %arrayidx.5, align 4
+  %arrayidx.5 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 5
+  %tmp5 = load double, ptr addrspace(4) %arrayidx.5, align 4
   %add.5 = fadd double %tmp5, %add.4
-  %arrayidx.6 = getelementptr inbounds double, double addrspace(4)* %weights, i64 6
-  %tmp6 = load double, double addrspace(4)* %arrayidx.6, align 4
+  %arrayidx.6 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 6
+  %tmp6 = load double, ptr addrspace(4) %arrayidx.6, align 4
   %add.6 = fadd double %tmp6, %add.5
-  %arrayidx.7 = getelementptr inbounds double, double addrspace(4)* %weights, i64 7
-  %tmp7 = load double, double addrspace(4)* %arrayidx.7, align 4
+  %arrayidx.7 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 7
+  %tmp7 = load double, ptr addrspace(4) %arrayidx.7, align 4
   %add.7 = fadd double %tmp7, %add.6
-  store double %add.7, double addrspace(1)* %out_ptr, align 4
+  store double %add.7, ptr addrspace(1) %out_ptr, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 8ea45def2525d..482e412589022 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -9,68 +9,68 @@
 
 ; EG: VTX_READ_8
 ; EG: AND_INT
-define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
-  %load = load i1, i1 addrspace(4)* %in
-  store i1 %load, i1 addrspace(1)* %out
+define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load i1, ptr addrspace(4) %in
+  store i1 %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v2i1:
-define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
-  store <2 x i1> %load, <2 x i1> addrspace(1)* %out
+define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(4) %in
+  store <2 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v3i1:
-define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
-  store <3 x i1> %load, <3 x i1> addrspace(1)* %out
+define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(4) %in
+  store <3 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v4i1:
-define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
-  store <4 x i1> %load, <4 x i1> addrspace(1)* %out
+define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(4) %in
+  store <4 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v8i1:
-define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
-  store <8 x i1> %load, <8 x i1> addrspace(1)* %out
+define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(4) %in
+  store <8 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v16i1:
-define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
-  store <16 x i1> %load, <16 x i1> addrspace(1)* %out
+define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(4) %in
+  store <16 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v32i1:
-define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
-  store <32 x i1> %load, <32 x i1> addrspace(1)* %out
+define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(4) %in
+  store <32 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v64i1:
-define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
-  store <64 x i1> %load, <64 x i1> addrspace(1)* %out
+define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(4) %in
+  store <64 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
-  %a = load i1, i1 addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %a = load i1, ptr addrspace(4) %in
   %ext = zext i1 %a to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -81,138 +81,138 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i
 
 ; EG: VTX_READ_8
 ; EG: BFE_INT
-define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
-  %a = load i1, i1 addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %a = load i1, ptr addrspace(4) %in
   %ext = sext i1 %a to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i32:
-define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <1 x i1>, ptr addrspace(4) %in
   %ext = zext <1 x i1> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i32:
-define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <1 x i1>, ptr addrspace(4) %in
   %ext = sext <1 x i1> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i32:
-define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(4) %in
   %ext = zext <2 x i1> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i32:
-define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(4) %in
   %ext = sext <2 x i1> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i32:
-define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(4) %in
   %ext = zext <3 x i1> %load to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i32:
-define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(4) %in
   %ext = sext <3 x i1> %load to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i32:
-define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(4) %in
   %ext = zext <4 x i1> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i32:
-define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(4) %in
   %ext = sext <4 x i1> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i32:
-define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(4) %in
   %ext = zext <8 x i1> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i32:
-define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(4) %in
   %ext = sext <8 x i1> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i32:
-define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(4) %in
   %ext = zext <16 x i1> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i32:
-define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(4) %in
   %ext = sext <16 x i1> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i32:
-define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(4) %in
   %ext = zext <32 x i1> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i32:
-define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(4) %in
   %ext = sext <32 x i1> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i32:
-define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(4) %in
   %ext = zext <64 x i1> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i32:
-define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(4) %in
   %ext = sext <64 x i1> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -221,10 +221,10 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspac
 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
 ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
-  %a = load i1, i1 addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %a = load i1, ptr addrspace(4) %in
   %ext = zext i1 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -233,138 +233,138 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
-  %a = load i1, i1 addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %a = load i1, ptr addrspace(4) %in
   %ext = sext i1 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i64:
-define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <1 x i1>, ptr addrspace(4) %in
   %ext = zext <1 x i1> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i64:
-define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <1 x i1>, ptr addrspace(4) %in
   %ext = sext <1 x i1> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i64:
-define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(4) %in
   %ext = zext <2 x i1> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i64:
-define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(4) %in
   %ext = sext <2 x i1> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i64:
-define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(4) %in
   %ext = zext <3 x i1> %load to <3 x i64>
-  store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
+  store <3 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i64:
-define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(4) %in
   %ext = sext <3 x i1> %load to <3 x i64>
-  store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
+  store <3 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i64:
-define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(4) %in
   %ext = zext <4 x i1> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i64:
-define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(4) %in
   %ext = sext <4 x i1> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i64:
-define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(4) %in
   %ext = zext <8 x i1> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i64:
-define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(4) %in
   %ext = sext <8 x i1> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i64:
-define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(4) %in
   %ext = zext <16 x i1> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i64:
-define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(4) %in
   %ext = sext <16 x i1> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i64:
-define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(4) %in
   %ext = zext <32 x i1> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i64:
-define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(4) %in
   %ext = sext <32 x i1> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i64:
-define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(4) %in
   %ext = zext <64 x i1> %load to <64 x i64>
-  store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+  store <64 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i64:
-define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(4) %in
   %ext = sext <64 x i1> %load to <64 x i64>
-  store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+  store <64 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 6c654b2fcb793..98641f302a6ab 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-NOHSA-VI %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck --check-prefix=EG %s
 
-define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(4)* %in) {
+define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 ; GCN-NOHSA-SI-LABEL: constant_load_i16:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -78,12 +78,12 @@ define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspa
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load i16, i16 addrspace(4)* %in
-  store i16 %ld, i16 addrspace(1)* %out
+  %ld = load i16, ptr addrspace(4) %in
+  store i16 %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) {
+define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 ; GCN-NOHSA-SI-LABEL: constant_load_v2i16:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -136,12 +136,12 @@ define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <2 x i16>, <2 x i16> addrspace(4)* %in
-  store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
+  %ld = load <2 x i16>, ptr addrspace(4) %in
+  store <2 x i16> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
+define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 ; GCN-NOHSA-SI-LABEL: constant_load_v3i16:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -225,12 +225,12 @@ define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3
 ; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
-  store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
+  %ld = load <3 x i16>, ptr addrspace(4) %in
+  store <3 x i16> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) {
+define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 ; GCN-NOHSA-SI-LABEL: constant_load_v4i16:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -286,12 +286,12 @@ define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <4 x i16>, <4 x i16> addrspace(4)* %in
-  store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
+  %ld = load <4 x i16>, ptr addrspace(4) %in
+  store <4 x i16> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) {
+define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 ; GCN-NOHSA-SI-LABEL: constant_load_v8i16:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -353,12 +353,12 @@ define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <8 x i16>, <8 x i16> addrspace(4)* %in
-  store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
+  %ld = load <8 x i16>, ptr addrspace(4) %in
+  store <8 x i16> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) {
+define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 ; GCN-NOHSA-SI-LABEL: constant_load_v16i16:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
@@ -449,12 +449,12 @@ define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <16 x i16>, <16 x i16> addrspace(4)* %in
-  store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
+  %ld = load <16 x i16>, ptr addrspace(4) %in
+  store <16 x i16> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v16i16_align2(<16 x i16> addrspace(4)* %ptr0) #0 {
+define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_load_v16i16_align2:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -591,12 +591,12 @@ define amdgpu_kernel void @constant_load_v16i16_align2(<16 x i16> addrspace(4)*
 ; EG-NEXT:     MOV * T2.X, literal.x,
 ; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
 entry:
-  %ld =  load <16 x i16>, <16 x i16> addrspace(4)* %ptr0, align 2
-  store <16 x i16> %ld, <16 x i16> addrspace(1)* undef, align 32
+  %ld =  load <16 x i16>, ptr addrspace(4) %ptr0, align 2
+  store <16 x i16> %ld, ptr addrspace(1) undef, align 32
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_i16_to_i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -659,13 +659,13 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out,
 ; EG-NEXT:    ALU clause starting at 9:
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %a = load i16, i16 addrspace(4)* %in
+  %a = load i16, ptr addrspace(4) %in
   %ext = zext i16 %a to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_i16_to_i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -729,13 +729,13 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out,
 ; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
-  %a = load i16, i16 addrspace(4)* %in
+  %a = load i16, ptr addrspace(4) %in
   %ext = sext i16 %a to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v1i16_to_v1i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -798,13 +798,13 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(
 ; EG-NEXT:    ALU clause starting at 9:
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
+  %load = load <1 x i16>, ptr addrspace(4) %in
   %ext = zext <1 x i16> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v1i16_to_v1i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -868,13 +868,13 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(
 ; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
-  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
+  %load = load <1 x i16>, ptr addrspace(4) %in
   %ext = sext <1 x i16> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v2i16_to_v2i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -938,14 +938,14 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(
 ; EG-NEXT:     AND_INT T4.X, T4.X, literal.x,
 ; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
-  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
+  %load = load <2 x i16>, ptr addrspace(4) %in
   %ext = zext <2 x i16> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; TODO: We should use ASHR instead of LSHR + BFE
-define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v2i16_to_v2i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1010,13 +1010,13 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(
 ; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
 ; EG-NEXT:     BFE_INT * T5.Y, PV.W, 0.0, literal.x,
 ; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
-  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
+  %load = load <2 x i16>, ptr addrspace(4) %in
   %ext = sext <2 x i16> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
+define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v3i16_to_v3i32:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1093,13 +1093,13 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(
 ; EG-NEXT:     MOV * T3.Y, T1.X,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
+  %ld = load <3 x i16>, ptr addrspace(4) %in
   %ext = zext <3 x i16> %ld to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
+define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v3i16_to_v3i32:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1179,15 +1179,15 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(
 ; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
+  %ld = load <3 x i16>, ptr addrspace(4) %in
   %ext = sext <3 x i16> %ld to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; v4i16 is naturally 8 byte aligned
 ; TODO: This should use LD, but for some reason there are redundant MOVs
-define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1271,16 +1271,16 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(
 ; EG-NEXT:     AND_INT T5.X, T0.Y, literal.x,
 ; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
-  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
+  %load = load <4 x i16>, ptr addrspace(4) %in
   %ext = zext <4 x i16> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; v4i16 is naturally 8 byte aligned
 ; TODO: This should use LD, but for some reason there are redundant MOVs
 ; TODO: We should use ASHR instead of LSHR + BFE
-define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v4i16_to_v4i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1366,16 +1366,16 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(
 ; EG-NEXT:     LSHR T6.X, KC0[2].Y, literal.x,
 ; EG-NEXT:     BFE_INT * T5.Y, PS, 0.0, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
+  %load = load <4 x i16>, ptr addrspace(4) %in
   %ext = sext <4 x i16> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; v8i16 is naturally 16 byte aligned
 ; TODO: These should use LSHR instead of BFE_UINT
 ; TODO: This should use DST, but for some reason there are redundant MOVs
-define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1497,16 +1497,16 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(
 ; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; EG-NEXT:     LSHR * T10.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
+  %load = load <8 x i16>, ptr addrspace(4) %in
   %ext = zext <8 x i16> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; v8i16 is naturally 16 byte aligned
 ; TODO: 4 of these should use ASHR instead of LSHR + BFE_INT
 ; TODO: This should use DST, but for some reason there are redundant MOVs
-define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v8i16_to_v8i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1630,13 +1630,13 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(
 ; EG-NEXT:     LSHR T10.X, PS, literal.x,
 ; EG-NEXT:     BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
+  %load = load <8 x i16>, ptr addrspace(4) %in
   %ext = sext <8 x i16> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1844,13 +1844,13 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa
 ; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
 ; EG-NEXT:     LSHR * T18.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
+  %load = load <16 x i16>, ptr addrspace(4) %in
   %ext = zext <16 x i16> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2062,13 +2062,13 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspa
 ; EG-NEXT:     LSHR T12.X, PS, literal.x,
 ; EG-NEXT:     BFE_INT * T18.Y, PV.Z, 0.0, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
+  %load = load <16 x i16>, ptr addrspace(4) %in
   %ext = sext <16 x i16> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
@@ -2448,13 +2448,13 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
 ; EG-NEXT:     LSHR * T34.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
+  %load = load <32 x i16>, ptr addrspace(4) %in
   %ext = zext <32 x i16> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v32i16_to_v32i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
@@ -2844,13 +2844,13 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa
 ; EG-NEXT:     LSHR T24.X, PS, literal.x,
 ; EG-NEXT:     BFE_INT * T34.Y, PV.Z, 0.0, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
+  %load = load <32 x i16>, ptr addrspace(4) %in
   %ext = sext <32 x i16> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x9
@@ -3578,13 +3578,13 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; EG-NEXT:    240(3.363116e-43), 0(0.000000e+00)
 ; EG-NEXT:     LSHR * T66.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
+  %load = load <64 x i16>, ptr addrspace(4) %in
   %ext = zext <64 x i16> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v64i16_to_v64i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[36:39], s[0:1], 0x9
@@ -4330,13 +4330,13 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; EG-NEXT:     LSHR T48.X, PS, literal.x,
 ; EG-NEXT:     BFE_INT * T66.Y, PV.Z, 0.0, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-  %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
+  %load = load <64 x i16>, ptr addrspace(4) %in
   %ext = sext <64 x i16> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_i16_to_i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4404,9 +4404,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out,
 ; EG-NEXT:     MOV * T0.Y, 0.0,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %a = load i16, i16 addrspace(4)* %in
+  %a = load i16, ptr addrspace(4) %in
   %ext = zext i16 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -4415,7 +4415,7 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out,
 ;          t31: i64 = any_extend t28
 ;        t33: i64 = sign_extend_inreg t31, ValueType:ch:i16
 ; TODO: These could be expanded earlier using ASHR 15
-define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_i16_to_i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4485,13 +4485,13 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out,
 ; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
 ; EG-NEXT:     ASHR * T0.Y, PV.X, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-  %a = load i16, i16 addrspace(4)* %in
+  %a = load i16, ptr addrspace(4) %in
   %ext = sext i16 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v1i16_to_v1i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4559,13 +4559,13 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(
 ; EG-NEXT:     MOV * T0.Y, 0.0,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
+  %load = load <1 x i16>, ptr addrspace(4) %in
   %ext = zext <1 x i16> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v1i16_to_v1i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4635,13 +4635,13 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(
 ; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
 ; EG-NEXT:     ASHR * T0.Y, PV.X, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
+  %load = load <1 x i16>, ptr addrspace(4) %in
   %ext = sext <1 x i16> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v2i16_to_v2i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4715,13 +4715,13 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(
 ; EG-NEXT:     MOV T4.W, 0.0,
 ; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
-  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
+  %load = load <2 x i16>, ptr addrspace(4) %in
   %ext = zext <2 x i16> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v2i16_to_v2i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4798,13 +4798,13 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(
 ; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
 ; EG-NEXT:     ASHR * T4.Y, PV.X, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
-  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
+  %load = load <2 x i16>, ptr addrspace(4) %in
   %ext = sext <2 x i16> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4911,13 +4911,13 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(
 ; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; EG-NEXT:     LSHR * T8.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
+  %load = load <4 x i16>, ptr addrspace(4) %in
   %ext = zext <4 x i16> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v4i16_to_v4i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5037,13 +5037,13 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(
 ; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
 ; EG-NEXT:     ASHR * T7.Y, PV.X, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
+  %load = load <4 x i16>, ptr addrspace(4) %in
   %ext = sext <4 x i16> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5203,13 +5203,13 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(
 ; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
 ; EG-NEXT:     LSHR * T14.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
+  %load = load <8 x i16>, ptr addrspace(4) %in
   %ext = zext <8 x i16> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v8i16_to_v8i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5403,13 +5403,13 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(
 ; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
 ; EG-NEXT:     ASHR * T14.Y, PV.X, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
+  %load = load <8 x i16>, ptr addrspace(4) %in
   %ext = sext <8 x i16> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5686,13 +5686,13 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
 ; EG-NEXT:     LSHR * T26.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
+  %load = load <16 x i16>, ptr addrspace(4) %in
   %ext = zext <16 x i16> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6041,13 +6041,13 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa
 ; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
 ; EG-NEXT:     ASHR * T26.Y, PV.X, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
+  %load = load <16 x i16>, ptr addrspace(4) %in
   %ext = sext <16 x i16> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
@@ -6564,13 +6564,13 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
 ; EG-NEXT:     LSHR * T50.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
+  %load = load <32 x i16>, ptr addrspace(4) %in
   %ext = zext <32 x i16> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v32i16_to_v32i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
@@ -7224,25 +7224,25 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
 ; EG-NEXT:     ASHR * T50.Y, PV.X, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
+  %load = load <32 x i16>, ptr addrspace(4) %in
   %ext = sext <32 x i16> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; These trigger undefined register machine verifier errors
 
-; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
-;   %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
+; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+;   %load = load <64 x i16>, ptr addrspace(4) %in
 ;   %ext = zext <64 x i16> %load to <64 x i64>
-;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+;   store <64 x i64> %ext, ptr addrspace(1) %out
 ;   ret void
 ; }
 
-; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
-;   %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
+; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+;   %load = load <64 x i16>, ptr addrspace(4) %in
 ;   %ext = sext <64 x i16> %load to <64 x i64>
-;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+;   store <64 x i64> %ext, ptr addrspace(1) %out
 ;   ret void
 ; }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index be0ffa0f65ee1..4f8dcdd7aa6d2 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -7,9 +7,9 @@
 ; FUNC-LABEL: {{^}}constant_load_i64:
 ; GCN: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; EG: VTX_READ_64
-define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(4)* %in) #0 {
-  %ld = load i64, i64 addrspace(4)* %in
-  store i64 %ld, i64 addrspace(1)* %out
+define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %ld = load i64, ptr addrspace(4) %in
+  store i64 %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -17,10 +17,10 @@ define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspa
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load <2 x i64>, <2 x i64> addrspace(4)* %in
-  store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
+  %ld = load <2 x i64>, ptr addrspace(4) %in
+  store <2 x i64> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -30,10 +30,10 @@ entry:
 
 ; EG-DAG: VTX_READ_128
 ; EG-DAG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load <3 x i64>, <3 x i64> addrspace(4)* %in
-  store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
+  %ld = load <3 x i64>, ptr addrspace(4) %in
+  store <3 x i64> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -42,10 +42,10 @@ entry:
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load <4 x i64>, <4 x i64> addrspace(4)* %in
-  store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
+  %ld = load <4 x i64>, ptr addrspace(4) %in
+  store <4 x i64> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -56,10 +56,10 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load <8 x i64>, <8 x i64> addrspace(4)* %in
-  store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
+  %ld = load <8 x i64>, ptr addrspace(4) %in
+  store <8 x i64> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -75,10 +75,10 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load <16 x i64>, <16 x i64> addrspace(4)* %in
-  store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
+  %ld = load <16 x i64>, ptr addrspace(4) %in
+  store <16 x i64> %ld, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 9d0d387bfaa9e..9c193a7d4a452 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -10,10 +10,10 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; TODO: NOT AND
-define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load i8, i8 addrspace(4)* %in
-  store i8 %ld, i8 addrspace(1)* %out
+  %ld = load i8, ptr addrspace(4) %in
+  store i8 %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -22,10 +22,10 @@ entry:
 ; GCN-HSA: flat_load_ushort v
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load <2 x i8>, <2 x i8> addrspace(4)* %in
-  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  %ld = load <2 x i8>, ptr addrspace(4) %in
+  store <2 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -33,10 +33,10 @@ entry:
 ; GCN: s_load_dword s
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in
-  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
+  %ld = load <3 x i8>, ptr addrspace(4) %in
+  store <3 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -44,10 +44,10 @@ entry:
 ; GCN: s_load_dword s
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load <4 x i8>, <4 x i8> addrspace(4)* %in
-  store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
+  %ld = load <4 x i8>, ptr addrspace(4) %in
+  store <4 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -55,10 +55,10 @@ entry:
 ; GCN: s_load_dwordx2
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load <8 x i8>, <8 x i8> addrspace(4)* %in
-  store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
+  %ld = load <8 x i8>, ptr addrspace(4) %in
+  store <8 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -66,10 +66,10 @@ entry:
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load <16 x i8>, <16 x i8> addrspace(4)* %in
-  store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
+  %ld = load <16 x i8>, ptr addrspace(4) %in
+  store <16 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -78,10 +78,10 @@ entry:
 ; GCN-HSA: flat_load_ubyte
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
-  %a = load i8, i8 addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %a = load i8, ptr addrspace(4) %in
   %ext = zext i8 %a to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -92,20 +92,20 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i
 ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
-  %ld = load i8, i8 addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %ld = load i8, ptr addrspace(4) %in
   %ext = sext i8 %ld to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i32:
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = zext <1 x i8> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -114,10 +114,10 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1
 ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = sext <1 x i8> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -129,10 +129,10 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1
 ; TODO: This should use DST, but for some reason there are redundant MOVs
 ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
 ; EG: 8
-define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = zext <2 x i8> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -150,10 +150,10 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = sext <2 x i8> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -170,11 +170,11 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1
 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in
+  %ld = load <3 x i8>, ptr addrspace(4) %in
   %ext = zext <3 x i8> %ld to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -193,11 +193,11 @@ entry:
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in
+  %ld = load <3 x i8>, ptr addrspace(4) %in
   %ext = sext <3 x i8> %ld to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -214,10 +214,10 @@ entry:
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = zext <4 x i8> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -236,10 +236,10 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = sext <4 x i8> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -264,10 +264,10 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = zext <8 x i8> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -294,10 +294,10 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = sext <8 x i8> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -335,10 +335,10 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(4) %in
   %ext = zext <16 x i8> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -378,10 +378,10 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspac
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(4) %in
   %ext = sext <16 x i8> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -450,10 +450,10 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspac
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = zext <32 x i8> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -526,10 +526,10 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspac
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = sext <32 x i8> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -539,10 +539,10 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspac
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 {
-  %load = load <64 x i8>, <64 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <64 x i8>, ptr addrspace(4) %in
   %ext = zext <64 x i8> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -552,10 +552,10 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspac
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 {
-  %load = load <64 x i8>, <64 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <64 x i8>, ptr addrspace(4) %in
   %ext = sext <64 x i8> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -570,10 +570,10 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspac
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
-  %a = load i8, i8 addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %a = load i8, ptr addrspace(4) %in
   %ext = zext i8 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -589,10 +589,10 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7 ?
 ; EG: 31
-define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
-  %a = load i8, i8 addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %a = load i8, ptr addrspace(4) %in
   %ext = sext i8 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -600,10 +600,10 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = zext <1 x i8> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -613,90 +613,90 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7 ?
 ; EG: 31
-define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = sext <1 x i8> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i64:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = zext <2 x i8> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i64:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = sext <2 x i8> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = zext <4 x i8> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = sext <4 x i8> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = zext <8 x i8> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = sext <8 x i8> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(4) %in
   %ext = zext <16 x i8> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(4) %in
   %ext = sext <16 x i8> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -704,10 +704,10 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspac
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = zext <32 x i8> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -715,26 +715,26 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspac
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = sext <32 x i8> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i64:
-; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(4)* %in
+; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+;   %load = load <64 x i8>, ptr addrspace(4) %in
 ;   %ext = zext <64 x i8> %load to <64 x i64>
-;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+;   store <64 x i64> %ext, ptr addrspace(1) %out
 ;   ret void
 ; }
 
 ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i64:
-; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(4)* %in
+; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+;   %load = load <64 x i8>, ptr addrspace(4) %in
 ;   %ext = sext <64 x i8> %load to <64 x i64>
-;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+;   store <64 x i64> %ext, ptr addrspace(1) %out
 ;   ret void
 ; }
 
@@ -744,10 +744,10 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspac
 
 ; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]],
 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
-define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
-  %a = load i8, i8 addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %a = load i8, ptr addrspace(4) %in
   %ext = zext i8 %a to i16
-  store i16 %ext, i16 addrspace(1)* %out
+  store i16 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -759,18 +759,18 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i
 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
-  %a = load i8, i8 addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %a = load i8, ptr addrspace(4) %in
   %ext = sext i8 %a to i16
-  store i16 %ext, i16 addrspace(1)* %out
+  store i16 %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i16:
-define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = zext <1 x i8> %load to <1 x i16>
-  store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
+  store <1 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -778,20 +778,20 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = sext <1 x i8> %load to <1 x i16>
-  store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
+  store <1 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i16:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = zext <2 x i8> %load to <2 x i16>
-  store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -800,20 +800,20 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = sext <2 x i8> %load to <2 x i16>
-  store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i16:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = zext <4 x i8> %load to <4 x i16>
-  store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -824,20 +824,20 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = sext <4 x i8> %load to <4 x i16>
-  store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i16:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = zext <8 x i8> %load to <8 x i16>
-  store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
+  store <8 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -853,20 +853,20 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 
-define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = sext <8 x i8> %load to <8 x i16>
-  store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
+  store <8 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i16:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(4) %in
   %ext = zext <16 x i8> %load to <16 x i16>
-  store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+  store <16 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -889,10 +889,10 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspac
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(4) %in
   %ext = sext <16 x i8> %load to <16 x i16>
-  store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+  store <16 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -900,10 +900,10 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspac
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = zext <32 x i8> %load to <32 x i16>
-  store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
+  store <32 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -943,26 +943,26 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspac
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(4)* %in
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = sext <32 x i8> %load to <32 x i16>
-  store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
+  store <32 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i16:
-; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(4)* %in
+; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+;   %load = load <64 x i8>, ptr addrspace(4) %in
 ;   %ext = zext <64 x i8> %load to <64 x i16>
-;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
+;   store <64 x i16> %ext, ptr addrspace(1) %out
 ;   ret void
 ; }
 
 ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i16:
-; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(4)* %in
+; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
+;   %load = load <64 x i8>, ptr addrspace(4) %in
 ;   %ext = sext <64 x i8> %load to <64 x i16>
-;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
+;   store <64 x i16> %ext, ptr addrspace(1) %out
 ;   ret void
 ; }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll
index 27d644ce011e6..55646206622f1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -8,19 +8,19 @@
 
 ; GCN-HSA: flat_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
 ; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
-define amdgpu_kernel void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
-  %ld = load double, double addrspace(1)* %in
-  store double %ld, double addrspace(1)* %out
+define amdgpu_kernel void @global_load_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %ld = load double, ptr addrspace(1) %in
+  store double %ld, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v2f64:
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <2 x double>, <2 x double> addrspace(1)* %in
-  store <2 x double> %ld, <2 x double> addrspace(1)* %out
+  %ld = load <2 x double>, ptr addrspace(1) %in
+  store <2 x double> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -29,10 +29,10 @@ entry:
 ; GCN-NOHSA-DAG: buffer_load_dwordx2
 ; GCN-HSA-DAG: flat_load_dwordx4
 ; GCN-HSA-DAG: flat_load_dwordx2
-define amdgpu_kernel void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <3 x double>, <3 x double> addrspace(1)* %in
-  store <3 x double> %ld, <3 x double> addrspace(1)* %out
+  %ld = load <3 x double>, ptr addrspace(1) %in
+  store <3 x double> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -42,10 +42,10 @@ entry:
 
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <4 x double>, <4 x double> addrspace(1)* %in
-  store <4 x double> %ld, <4 x double> addrspace(1)* %out
+  %ld = load <4 x double>, ptr addrspace(1) %in
+  store <4 x double> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -59,10 +59,10 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <8 x double>, <8 x double> addrspace(1)* %in
-  store <8 x double> %ld, <8 x double> addrspace(1)* %out
+  %ld = load <8 x double>, ptr addrspace(1) %in
+  store <8 x double> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -84,10 +84,10 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <16 x double>, <16 x double> addrspace(1)* %in
-  store <16 x double> %ld, <16 x double> addrspace(1)* %out
+  %ld = load <16 x double>, ptr addrspace(1) %in
+  store <16 x double> %ld, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i1.ll b/llvm/test/CodeGen/AMDGPU/load-global-i1.ll
index 3575a707bd532..f36d55ec3fdc2 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i1.ll
@@ -9,68 +9,68 @@
 
 ; EG: VTX_READ_8
 ; EG: AND_INT
-define amdgpu_kernel void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
-  %load = load i1, i1 addrspace(1)* %in
-  store i1 %load, i1 addrspace(1)* %out
+define amdgpu_kernel void @global_load_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load i1, ptr addrspace(1) %in
+  store i1 %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v2i1:
-define amdgpu_kernel void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
-  store <2 x i1> %load, <2 x i1> addrspace(1)* %out
+define amdgpu_kernel void @global_load_v2i1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(1) %in
+  store <2 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v3i1:
-define amdgpu_kernel void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
-  store <3 x i1> %load, <3 x i1> addrspace(1)* %out
+define amdgpu_kernel void @global_load_v3i1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(1) %in
+  store <3 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v4i1:
-define amdgpu_kernel void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
-  store <4 x i1> %load, <4 x i1> addrspace(1)* %out
+define amdgpu_kernel void @global_load_v4i1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(1) %in
+  store <4 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v8i1:
-define amdgpu_kernel void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
-  store <8 x i1> %load, <8 x i1> addrspace(1)* %out
+define amdgpu_kernel void @global_load_v8i1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(1) %in
+  store <8 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v16i1:
-define amdgpu_kernel void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
-  store <16 x i1> %load, <16 x i1> addrspace(1)* %out
+define amdgpu_kernel void @global_load_v16i1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(1) %in
+  store <16 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v32i1:
-define amdgpu_kernel void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
-  store <32 x i1> %load, <32 x i1> addrspace(1)* %out
+define amdgpu_kernel void @global_load_v32i1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(1) %in
+  store <32 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v64i1:
-define amdgpu_kernel void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
-  store <64 x i1> %load, <64 x i1> addrspace(1)* %out
+define amdgpu_kernel void @global_load_v64i1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(1) %in
+  store <64 x i1> %load, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_i1_to_i32:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
-  %a = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_i1_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i1, ptr addrspace(1) %in
   %ext = zext i1 %a to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -81,138 +81,138 @@ define amdgpu_kernel void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1
 
 ; EG: VTX_READ_8
 ; EG: BFE_INT
-define amdgpu_kernel void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
-  %a = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_i1_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i1, ptr addrspace(1) %in
   %ext = sext i1 %a to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i32:
-define amdgpu_kernel void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v1i1_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <1 x i1>, ptr addrspace(1) %in
   %ext = zext <1 x i1> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i32:
-define amdgpu_kernel void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v1i1_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <1 x i1>, ptr addrspace(1) %in
   %ext = sext <1 x i1> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i32:
-define amdgpu_kernel void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v2i1_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(1) %in
   %ext = zext <2 x i1> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i32:
-define amdgpu_kernel void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v2i1_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(1) %in
   %ext = sext <2 x i1> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i32:
-define amdgpu_kernel void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v3i1_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(1) %in
   %ext = zext <3 x i1> %load to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i32:
-define amdgpu_kernel void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v3i1_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(1) %in
   %ext = sext <3 x i1> %load to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i32:
-define amdgpu_kernel void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v4i1_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(1) %in
   %ext = zext <4 x i1> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i32:
-define amdgpu_kernel void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v4i1_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(1) %in
   %ext = sext <4 x i1> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i32:
-define amdgpu_kernel void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v8i1_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(1) %in
   %ext = zext <8 x i1> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i32:
-define amdgpu_kernel void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v8i1_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(1) %in
   %ext = sext <8 x i1> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i32:
-define amdgpu_kernel void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(1) %in
   %ext = zext <16 x i1> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i32:
-define amdgpu_kernel void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(1) %in
   %ext = sext <16 x i1> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i32:
-define amdgpu_kernel void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v32i1_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(1) %in
   %ext = zext <32 x i1> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i32:
-define amdgpu_kernel void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v32i1_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(1) %in
   %ext = sext <32 x i1> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i32:
-define amdgpu_kernel void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v64i1_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(1) %in
   %ext = zext <64 x i1> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i32:
-define amdgpu_kernel void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v64i1_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(1) %in
   %ext = sext <64 x i1> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -221,10 +221,10 @@ define amdgpu_kernel void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(
 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
 ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]{{$}}
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
-  %a = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i1, ptr addrspace(1) %in
   %ext = zext i1 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -233,138 +233,138 @@ define amdgpu_kernel void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
-  %a = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i1, ptr addrspace(1) %in
   %ext = sext i1 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i64:
-define amdgpu_kernel void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v1i1_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <1 x i1>, ptr addrspace(1) %in
   %ext = zext <1 x i1> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i64:
-define amdgpu_kernel void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v1i1_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <1 x i1>, ptr addrspace(1) %in
   %ext = sext <1 x i1> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i64:
-define amdgpu_kernel void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v2i1_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(1) %in
   %ext = zext <2 x i1> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i64:
-define amdgpu_kernel void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v2i1_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(1) %in
   %ext = sext <2 x i1> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i64:
-define amdgpu_kernel void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v3i1_to_v3i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(1) %in
   %ext = zext <3 x i1> %load to <3 x i64>
-  store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
+  store <3 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i64:
-define amdgpu_kernel void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v3i1_to_v3i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(1) %in
   %ext = sext <3 x i1> %load to <3 x i64>
-  store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
+  store <3 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i64:
-define amdgpu_kernel void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(1) %in
   %ext = zext <4 x i1> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i64:
-define amdgpu_kernel void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(1) %in
   %ext = sext <4 x i1> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i64:
-define amdgpu_kernel void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(1) %in
   %ext = zext <8 x i1> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i64:
-define amdgpu_kernel void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(1) %in
   %ext = sext <8 x i1> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i64:
-define amdgpu_kernel void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v16i1_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(1) %in
   %ext = zext <16 x i1> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i64:
-define amdgpu_kernel void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v16i1_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(1) %in
   %ext = sext <16 x i1> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i64:
-define amdgpu_kernel void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v32i1_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(1) %in
   %ext = zext <32 x i1> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i64:
-define amdgpu_kernel void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v32i1_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(1) %in
   %ext = sext <32 x i1> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i64:
-define amdgpu_kernel void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(1) %in
   %ext = zext <64 x i1> %load to <64 x i64>
-  store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+  store <64 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i64:
-define amdgpu_kernel void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v64i1_to_v64i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(1) %in
   %ext = sext <64 x i1> %load to <64 x i64>
-  store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+  store <64 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 2f3a6f01e2912..72f20109350cb 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -7,7 +7,7 @@
 
 ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented
 
-define amdgpu_kernel void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-NOHSA-SI-LABEL: global_load_i16:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -107,12 +107,12 @@ define amdgpu_kernel void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load i16, i16 addrspace(1)* %in
-  store i16 %ld, i16 addrspace(1)* %out
+  %ld = load i16, ptr addrspace(1) %in
+  store i16 %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-NOHSA-SI-LABEL: global_load_v2i16:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -192,12 +192,12 @@ define amdgpu_kernel void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
-  store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
+  %ld = load <2 x i16>, ptr addrspace(1) %in
+  store <2 x i16> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-NOHSA-SI-LABEL: global_load_v3i16:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -324,12 +324,12 @@ define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x
 ; CM-NEXT:     LSHR * T8.X, T0.W, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
-  store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
+  %ld = load <3 x i16>, ptr addrspace(1) %in
+  store <3 x i16> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-NOHSA-SI-LABEL: global_load_v4i16:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -409,12 +409,12 @@ define amdgpu_kernel void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
-  store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
+  %ld = load <4 x i16>, ptr addrspace(1) %in
+  store <4 x i16> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-NOHSA-SI-LABEL: global_load_v8i16:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -494,12 +494,12 @@ define amdgpu_kernel void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <8 x i16>, <8 x i16> addrspace(1)* %in
-  store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
+  %ld = load <8 x i16>, ptr addrspace(1) %in
+  store <8 x i16> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-NOHSA-SI-LABEL: global_load_v16i16:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -614,12 +614,12 @@ define amdgpu_kernel void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16
 ; CM-NEXT:     LSHR * T1.X, PV.W, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in
-  store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
+  %ld = load <16 x i16>, ptr addrspace(1) %in
+  store <16 x i16> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_load_v16i16_align2(<16 x i16> addrspace(1)* %in, <16 x i16> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GCN-NOHSA-SI-LABEL: global_load_v16i16_align2:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -796,12 +796,12 @@ define amdgpu_kernel void @global_load_v16i16_align2(<16 x i16> addrspace(1)* %i
 ; CM-NEXT:     LSHR * T3.X, KC0[2].Z, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld =  load <16 x i16>, <16 x i16> addrspace(1)* %in, align 2
-  store <16 x i16> %ld, <16 x i16> addrspace(1)* %out, align 32
+  %ld =  load <16 x i16>, ptr addrspace(1) %in, align 2
+  store <16 x i16> %ld, ptr addrspace(1) %out, align 32
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -880,13 +880,13 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i1
 ; CM-NEXT:    ALU clause starting at 9:
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %a = load i16, i16 addrspace(1)* %in
+  %a = load i16, ptr addrspace(1) %in
   %ext = zext i16 %a to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -968,13 +968,13 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i1
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %a = load i16, i16 addrspace(1)* %in
+  %a = load i16, ptr addrspace(1) %in
   %ext = sext i16 %a to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1053,13 +1053,13 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)
 ; CM-NEXT:    ALU clause starting at 9:
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+  %load = load <1 x i16>, ptr addrspace(1) %in
   %ext = zext <1 x i16> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1141,13 +1141,13 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)
 ; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+  %load = load <1 x i16>, ptr addrspace(1) %in
   %ext = sext <1 x i16> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1239,14 +1239,14 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+  %load = load <2 x i16>, ptr addrspace(1) %in
   %ext = zext <2 x i16> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; TODO: This should use ASHR instead of LSHR + BFE
-define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1339,13 +1339,13 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)
 ; CM-NEXT:     LSHR T4.X, KC0[2].Y, literal.x,
 ; CM-NEXT:     BFE_INT * T5.Y, PV.W, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+  %load = load <2 x i16>, ptr addrspace(1) %in
   %ext = sext <2 x i16> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v3i16_to_v3i32:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1449,13 +1449,13 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)
 ; CM-NEXT:     MOV * T3.Y, T1.X,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
+  %ld = load <3 x i16>, ptr addrspace(1) %in
   %ext = zext <3 x i16> %ld to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v3i16_to_v3i32:
 ; GCN-NOHSA-SI:       ; %bb.0: ; %entry
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1565,14 +1565,14 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)
 ; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
+  %ld = load <3 x i16>, ptr addrspace(1) %in
   %ext = sext <3 x i16> %ld to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; TODO: This should use DST, but for some there are redundant MOVs
-define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1686,15 +1686,15 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; CM-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+  %load = load <4 x i16>, ptr addrspace(1) %in
   %ext = zext <4 x i16> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; TODO: We should use ASHR instead of LSHR + BFE
 ; TODO: This should use DST, but for some there are redundant MOVs
-define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1813,14 +1813,14 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)
 ; CM-NEXT:     LSHR T6.X, KC0[2].Y, literal.x,
 ; CM-NEXT:     BFE_INT * T5.Y, PV.Z, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+  %load = load <4 x i16>, ptr addrspace(1) %in
   %ext = sext <4 x i16> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; TODO: These should use LSHR instead of BFE_UINT
-define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1962,14 +1962,14 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; CM-NEXT:     LSHR * T10.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+  %load = load <8 x i16>, ptr addrspace(1) %in
   %ext = zext <8 x i16> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; TODO: These should use ASHR instead of LSHR + BFE_INT
-define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2115,13 +2115,13 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)
 ; CM-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
 ; CM-NEXT:     BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+  %load = load <8 x i16>, ptr addrspace(1) %in
   %ext = sext <8 x i16> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2351,13 +2351,13 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; CM-NEXT:     LSHR * T18.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+  %load = load <16 x i16>, ptr addrspace(1) %in
   %ext = zext <16 x i16> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -2598,13 +2598,13 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace
 ; CM-NEXT:     LSHR T11.X, KC0[2].Y, literal.x,
 ; CM-NEXT:     BFE_INT * T17.Y, PV.Z, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+  %load = load <16 x i16>, ptr addrspace(1) %in
   %ext = sext <16 x i16> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v32i16_to_v32i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3009,13 +3009,13 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; CM-NEXT:     LSHR * T34.X, PV.W, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+  %load = load <32 x i16>, ptr addrspace(1) %in
   %ext = zext <32 x i16> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -3453,13 +3453,13 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace
 ; CM-NEXT:     LSHR T34.X, PV.Z, literal.x,
 ; CM-NEXT:     BFE_INT * T33.Y, T0.Y, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+  %load = load <32 x i16>, ptr addrspace(1) %in
   %ext = sext <32 x i16> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v64i16_to_v64i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -4245,13 +4245,13 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
 ; CM-NEXT:     LSHR * T66.X, PV.W, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+  %load = load <64 x i16>, ptr addrspace(1) %in
   %ext = zext <64 x i16> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v64i16_to_v64i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -5102,13 +5102,13 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace
 ; CM-NEXT:     LSHR T66.X, PV.Z, literal.x,
 ; CM-NEXT:     BFE_INT * T65.Y, T0.Y, 0.0, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
-  %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+  %load = load <64 x i16>, ptr addrspace(1) %in
   %ext = sext <64 x i16> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5193,9 +5193,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i1
 ; CM-NEXT:     MOV * T0.Y, 0.0,
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %a = load i16, i16 addrspace(1)* %in
+  %a = load i16, ptr addrspace(1) %in
   %ext = zext i16 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -5205,7 +5205,7 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i1
 ;        t33: i64 = sign_extend_inreg t31, ValueType:ch:i16
 
 ; TODO: These could be expanded earlier using ASHR 15
-define amdgpu_kernel void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5294,13 +5294,13 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i1
 ; CM-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:     ASHR * T0.Y, PV.X, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-  %a = load i16, i16 addrspace(1)* %in
+  %a = load i16, ptr addrspace(1) %in
   %ext = sext i16 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5385,14 +5385,14 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)
 ; CM-NEXT:     MOV * T0.Y, 0.0,
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+  %load = load <1 x i16>, ptr addrspace(1) %in
   %ext = zext <1 x i16> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; TODO: These could be expanded earlier using ASHR 15
-define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5481,13 +5481,13 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)
 ; CM-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:     ASHR * T0.Y, PV.X, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-  %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+  %load = load <1 x i16>, ptr addrspace(1) %in
   %ext = sext <1 x i16> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5589,13 +5589,13 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
 ; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+  %load = load <2 x i16>, ptr addrspace(1) %in
   %ext = zext <2 x i16> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5703,13 +5703,13 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)
 ; CM-NEXT:     LSHR T5.X, KC0[2].Y, literal.x,
 ; CM-NEXT:     ASHR * T4.Y, PV.X, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-  %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+  %load = load <2 x i16>, ptr addrspace(1) %in
   %ext = sext <2 x i16> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -5855,13 +5855,13 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+  %load = load <4 x i16>, ptr addrspace(1) %in
   %ext = zext <4 x i16> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6014,13 +6014,13 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)
 ; CM-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
 ; CM-NEXT:     ASHR * T7.Y, PV.X, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+  %load = load <4 x i16>, ptr addrspace(1) %in
   %ext = sext <4 x i16> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -6232,13 +6232,13 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; CM-NEXT:     LSHR * T14.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+  %load = load <8 x i16>, ptr addrspace(1) %in
   %ext = zext <8 x i16> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -6461,13 +6461,13 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)
 ; CM-NEXT:     LSHR T14.X, KC0[2].Y, literal.x,
 ; CM-NEXT:     ASHR * T7.Y, PV.X, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-  %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+  %load = load <8 x i16>, ptr addrspace(1) %in
   %ext = sext <8 x i16> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -6835,13 +6835,13 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; CM-NEXT:     LSHR * T26.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+  %load = load <16 x i16>, ptr addrspace(1) %in
   %ext = zext <16 x i16> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -7233,13 +7233,13 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace
 ; CM-NEXT:     LSHR T26.X, KC0[2].Y, literal.x,
 ; CM-NEXT:     ASHR * T12.Y, PV.X, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-  %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+  %load = load <16 x i16>, ptr addrspace(1) %in
   %ext = sext <16 x i16> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_zextload_v32i16_to_v32i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -7964,13 +7964,13 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
 ; CM-NEXT:     LSHR * T50.X, PV.W, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+  %load = load <32 x i16>, ptr addrspace(1) %in
   %ext = zext <32 x i16> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -8698,23 +8698,23 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace
 ; CM-NEXT:     LSHR T50.X, PV.W, literal.x,
 ; CM-NEXT:     ASHR * T38.Y, PV.X, literal.y,
 ; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
-  %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+  %load = load <32 x i16>, ptr addrspace(1) %in
   %ext = sext <32 x i16> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
-; define amdgpu_kernel void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
-;   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+; define amdgpu_kernel void @global_zextload_v64i16_to_v64i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+;   %load = load <64 x i16>, ptr addrspace(1) %in
 ;   %ext = zext <64 x i16> %load to <64 x i64>
-;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+;   store <64 x i64> %ext, ptr addrspace(1) %out
 ;   ret void
 ; }
 
-; define amdgpu_kernel void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
-;   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+; define amdgpu_kernel void @global_sextload_v64i16_to_v64i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+;   %load = load <64 x i16>, ptr addrspace(1) %in
 ;   %ext = sext <64 x i16> %load to <64 x i64>
-;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+;   store <64 x i64> %ext, ptr addrspace(1) %out
 ;   ret void
 ; }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
index 868dd29dd51c2..87ac70639370a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -13,9 +13,9 @@
 ; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
 
 ; EG: VTX_READ_64
-define amdgpu_kernel void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
-  %ld = load i64, i64 addrspace(1)* %in
-  store i64 %ld, i64 addrspace(1)* %out
+define amdgpu_kernel void @global_load_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %ld = load i64, ptr addrspace(1) %in
+  store i64 %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -24,10 +24,10 @@ define amdgpu_kernel void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace
 ; GCN-HSA: flat_load_dwordx4
 
 ; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <2 x i64>, <2 x i64> addrspace(1)* %in
-  store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
+  %ld = load <2 x i64>, ptr addrspace(1) %in
+  store <2 x i64> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -40,10 +40,10 @@ entry:
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <3 x i64>, <3 x i64> addrspace(1)* %in
-  store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
+  %ld = load <3 x i64>, ptr addrspace(1) %in
+  store <3 x i64> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -56,10 +56,10 @@ entry:
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <4 x i64>, <4 x i64> addrspace(1)* %in
-  store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
+  %ld = load <4 x i64>, ptr addrspace(1) %in
+  store <4 x i64> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -78,10 +78,10 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <8 x i64>, <8 x i64> addrspace(1)* %in
-  store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
+  %ld = load <8 x i64>, ptr addrspace(1) %in
+  store <8 x i64> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -112,10 +112,10 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <16 x i64>, <16 x i64> addrspace(1)* %in
-  store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
+  %ld = load <16 x i64>, ptr addrspace(1) %in
+  store <16 x i64> %ld, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index f5d1b6386fdb4..1013168ca450f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -11,10 +11,10 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; TODO: NOT AND
-define amdgpu_kernel void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load i8, i8 addrspace(1)* %in
-  store i8 %ld, i8 addrspace(1)* %out
+  %ld = load i8, ptr addrspace(1) %in
+  store i8 %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -23,10 +23,10 @@ entry:
 ; GCN-HSA: flat_load_ushort v
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
-  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  %ld = load <2 x i8>, ptr addrspace(1) %in
+  store <2 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -35,10 +35,10 @@ entry:
 ; GCN-HSA: flat_load_dword v
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
-  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
+  %ld = load <3 x i8>, ptr addrspace(1) %in
+  store <3 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -47,10 +47,10 @@ entry:
 ; GCN-HSA: flat_load_dword v
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
-  store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
+  %ld = load <4 x i8>, ptr addrspace(1) %in
+  store <4 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -59,10 +59,10 @@ entry:
 ; GCN-HSA: flat_load_dwordx2
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <8 x i8>, <8 x i8> addrspace(1)* %in
-  store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
+  %ld = load <8 x i8>, ptr addrspace(1) %in
+  store <8 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -72,10 +72,10 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <16 x i8>, <16 x i8> addrspace(1)* %in
-  store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
+  %ld = load <16 x i8>, ptr addrspace(1) %in
+  store <16 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -84,10 +84,10 @@ entry:
 ; GCN-HSA: flat_load_ubyte
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
-  %a = load i8, i8 addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i8, ptr addrspace(1) %in
   %ext = zext i8 %a to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -98,20 +98,20 @@ define amdgpu_kernel void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8
 ; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define amdgpu_kernel void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
-  %ld = load i8, i8 addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %ld = load i8, ptr addrspace(1) %in
   %ext = sext i8 %ld to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i32:
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(1) %in
   %ext = zext <1 x i8> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -120,10 +120,10 @@ define amdgpu_kernel void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)*
 ; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define amdgpu_kernel void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(1) %in
   %ext = sext <1 x i8> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -135,10 +135,10 @@ define amdgpu_kernel void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)*
 ; TODO: These should use DST, but for some there are redundant MOVs
 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
 ; EG-DAG: 8
-define amdgpu_kernel void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(1) %in
   %ext = zext <2 x i8> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -152,10 +152,10 @@ define amdgpu_kernel void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)*
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(1) %in
   %ext = sext <2 x i8> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -174,11 +174,11 @@ define amdgpu_kernel void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)*
 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+  %ld = load <3 x i8>, ptr addrspace(1) %in
   %ext = zext <3 x i8> %ld to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -207,11 +207,11 @@ entry:
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+  %ld = load <3 x i8>, ptr addrspace(1) %in
   %ext = sext <3 x i8> %ld to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -227,10 +227,10 @@ entry:
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(1) %in
   %ext = zext <4 x i8> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -248,10 +248,10 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)*
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(1) %in
   %ext = sext <4 x i8> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -273,10 +273,10 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)*
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(1) %in
   %ext = zext <8 x i8> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -300,10 +300,10 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)*
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(1) %in
   %ext = sext <8 x i8> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -341,10 +341,10 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)*
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(1) %in
   %ext = zext <16 x i8> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -384,10 +384,10 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(1) %in
   %ext = sext <16 x i8> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -456,10 +456,10 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(1) %in
   %ext = zext <32 x i8> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -532,10 +532,10 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(1) %in
   %ext = sext <32 x i8> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+  store <32 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -545,10 +545,10 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
-  %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <64 x i8>, ptr addrspace(1) %in
   %ext = zext <64 x i8> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -558,10 +558,10 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
-  %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <64 x i8>, ptr addrspace(1) %in
   %ext = sext <64 x i8> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+  store <64 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -576,10 +576,10 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define amdgpu_kernel void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
-  %a = load i8, i8 addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i8, ptr addrspace(1) %in
   %ext = zext i8 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -595,10 +595,10 @@ define amdgpu_kernel void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7 ?
 ; EG: 31
-define amdgpu_kernel void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
-  %a = load i8, i8 addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i8, ptr addrspace(1) %in
   %ext = sext i8 %a to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -606,10 +606,10 @@ define amdgpu_kernel void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define amdgpu_kernel void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(1) %in
   %ext = zext <1 x i8> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -619,90 +619,90 @@ define amdgpu_kernel void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)*
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7 ?
 ; EG: 31
-define amdgpu_kernel void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(1) %in
   %ext = sext <1 x i8> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+  store <1 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i64:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(1) %in
   %ext = zext <2 x i8> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i64:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(1) %in
   %ext = sext <2 x i8> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(1) %in
   %ext = zext <4 x i8> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(1) %in
   %ext = sext <4 x i8> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(1) %in
   %ext = zext <8 x i8> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(1) %in
   %ext = sext <8 x i8> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+  store <8 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(1) %in
   %ext = zext <16 x i8> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(1) %in
   %ext = sext <16 x i8> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+  store <16 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -710,10 +710,10 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(1) %in
   %ext = zext <32 x i8> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -721,26 +721,26 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(1) %in
   %ext = sext <32 x i8> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+  store <32 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i64:
-; define amdgpu_kernel void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; define amdgpu_kernel void @global_zextload_v64i8_to_v64i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+;   %load = load <64 x i8>, ptr addrspace(1) %in
 ;   %ext = zext <64 x i8> %load to <64 x i64>
-;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+;   store <64 x i64> %ext, ptr addrspace(1) %out
 ;   ret void
 ; }
 
 ; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i64:
-; define amdgpu_kernel void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; define amdgpu_kernel void @global_sextload_v64i8_to_v64i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+;   %load = load <64 x i8>, ptr addrspace(1) %in
 ;   %ext = sext <64 x i8> %load to <64 x i64>
-;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+;   store <64 x i64> %ext, ptr addrspace(1) %out
 ;   ret void
 ; }
 
@@ -752,10 +752,10 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(
 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
-  %a = load i8, i8 addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i8, ptr addrspace(1) %in
   %ext = zext i8 %a to i16
-  store i16 %ext, i16 addrspace(1)* %out
+  store i16 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -768,20 +768,20 @@ define amdgpu_kernel void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
-  %a = load i8, i8 addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i8, ptr addrspace(1) %in
   %ext = sext i8 %a to i16
-  store i16 %ext, i16 addrspace(1)* %out
+  store i16 %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i16:
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(1) %in
   %ext = zext <1 x i8> %load to <1 x i16>
-  store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
+  store <1 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -789,20 +789,20 @@ define amdgpu_kernel void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)*
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(1) %in
   %ext = sext <1 x i8> %load to <1 x i16>
-  store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
+  store <1 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i16:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(1) %in
   %ext = zext <2 x i8> %load to <2 x i16>
-  store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -811,20 +811,20 @@ define amdgpu_kernel void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)*
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(1) %in
   %ext = sext <2 x i8> %load to <2 x i16>
-  store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i16:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(1) %in
   %ext = zext <4 x i8> %load to <4 x i16>
-  store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -835,20 +835,20 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)*
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(1) %in
   %ext = sext <4 x i8> %load to <4 x i16>
-  store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i16:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(1) %in
   %ext = zext <8 x i8> %load to <8 x i16>
-  store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
+  store <8 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -863,20 +863,20 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)*
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(1) %in
   %ext = sext <8 x i8> %load to <8 x i16>
-  store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
+  store <8 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i16:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(1) %in
   %ext = zext <16 x i8> %load to <16 x i16>
-  store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+  store <16 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -899,10 +899,10 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(1) %in
   %ext = sext <16 x i8> %load to <16 x i16>
-  store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+  store <16 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -910,10 +910,10 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(1) %in
   %ext = zext <32 x i8> %load to <32 x i16>
-  store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
+  store <32 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -953,26 +953,26 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(1) %in
   %ext = sext <32 x i8> %load to <32 x i16>
-  store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
+  store <32 x i16> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i16:
-; define amdgpu_kernel void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; define amdgpu_kernel void @global_zextload_v64i8_to_v64i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+;   %load = load <64 x i8>, ptr addrspace(1) %in
 ;   %ext = zext <64 x i8> %load to <64 x i16>
-;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
+;   store <64 x i16> %ext, ptr addrspace(1) %out
 ;   ret void
 ; }
 
 ; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i16:
-; define amdgpu_kernel void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; define amdgpu_kernel void @global_sextload_v64i8_to_v64i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+;   %load = load <64 x i8>, ptr addrspace(1) %in
 ;   %ext = sext <64 x i8> %load to <64 x i16>
-;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
+;   store <64 x i16> %ext, ptr addrspace(1) %out
 ;   ret void
 ; }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 15de6af60c8f0..306cd7fb0f4af 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX803 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900-FLATSCR %s
 
-define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias %in) #0 {
+define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias %in) #0 {
 ; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -62,16 +62,16 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
-  %load.lo = load i16, i16 addrspace(3)* %in
-  %load.hi = load i16, i16 addrspace(3)* %gep
-  store i16 %load.lo, i16 addrspace(3)* null
+  %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
+  %load.lo = load i16, ptr addrspace(3) %in
+  %load.hi = load i16, ptr addrspace(3) %gep
+  store i16 %load.lo, ptr addrspace(3) null
   %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
   ret <2 x i16> %build1
 }
 
-define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 {
+define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(ptr addrspace(3) noalias %in) #0 {
 ; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -129,16 +129,16 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
-  %load.lo = load i16, i16 addrspace(3)* %in
-  %load.hi = load i16, i16 addrspace(3)* %gep
-  store i16 %load.hi, i16 addrspace(3)* null
+  %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
+  %load.lo = load i16, ptr addrspace(3) %in
+  %load.hi = load i16, ptr addrspace(3) %gep
+  store i16 %load.hi, ptr addrspace(3) null
   %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
   ret <2 x i16> %build1
 }
 
-define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalias %in, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
+define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out0, ptr addrspace(3) noalias %out1) #0 {
 ; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -196,17 +196,17 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalia
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
-  %load.lo = load i16, i16 addrspace(3)* %in
-  %load.hi = load i16, i16 addrspace(3)* %gep
-  store i16 %load.lo, i16 addrspace(3)* %out0
-  store i16 %load.hi, i16 addrspace(3)* %out1
+  %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
+  %load.lo = load i16, ptr addrspace(3) %in
+  %load.hi = load i16, ptr addrspace(3) %gep
+  store i16 %load.lo, ptr addrspace(3) %out0
+  store i16 %load.hi, ptr addrspace(3) %out1
   %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
   ret <2 x i16> %build1
 }
 
-define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
+define <2 x i16> @load_local_hi_v2i16_undeflo(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_undeflo:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -238,12 +238,12 @@ define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %build = insertelement <2 x i16> undef, i16 %load, i32 1
   ret <2 x i16> %build
 }
 
-define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
+define <2 x i16> @load_local_hi_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_reglo:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -279,13 +279,13 @@ define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
   ret <2 x i16> %build1
 }
 
-define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
+define void @load_local_hi_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -327,14 +327,14 @@ define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
+define <2 x i16> @load_local_hi_v2i16_zerolo(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_zerolo:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -370,13 +370,13 @@ define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
   ret <2 x i16> %build
 }
 
 ; FIXME: Remove m0 initialization
-define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
+define i32 @load_local_hi_v2i16_zerolo_shift(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_zerolo_shift:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -410,13 +410,13 @@ define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
 ; GFX900-FLATSCR-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %zext = zext i16 %load to i32
   %shift = shl i32 %zext, 16
   ret i32 %shift
 }
 
-define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
+define void @load_local_hi_v2f16_reglo_vreg(ptr addrspace(3) %in, half %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -458,14 +458,14 @@ define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load half, half addrspace(3)* %in
+  %load = load half, ptr addrspace(3) %in
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %load, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
+define void @load_local_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -507,15 +507,15 @@ define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %re
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i8, i8 addrspace(3)* %in
+  %load = load i8, ptr addrspace(3) %in
   %ext = zext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
+define void @load_local_hi_v2i16_reglo_vreg_sexti8(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -557,15 +557,15 @@ define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %re
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i8, i8 addrspace(3)* %in
+  %load = load i8, ptr addrspace(3) %in
   %ext = sext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
+define void @load_local_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(3) %in, half %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -607,17 +607,17 @@ define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %r
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i8, i8 addrspace(3)* %in
+  %load = load i8, ptr addrspace(3) %in
   %ext = zext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
 
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
+define void @load_local_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(3) %in, half %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -659,17 +659,17 @@ define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %r
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i8, i8 addrspace(3)* %in
+  %load = load i8, ptr addrspace(3) %in
   %ext = sext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
 
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
+define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_global_hi_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -712,15 +712,15 @@ define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
-  %load = load i16, i16 addrspace(1)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 -2047
+  %load = load i16, ptr addrspace(1) %gep
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
+define void @load_global_hi_v2f16_reglo_vreg(ptr addrspace(1) %in, half %reg) #0 {
 ; GFX900-LABEL: load_global_hi_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -763,15 +763,15 @@ define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg)
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
-  %load = load half, half addrspace(1)* %gep
+  %gep = getelementptr inbounds half, ptr addrspace(1) %in, i64 -2047
+  %load = load half, ptr addrspace(1) %gep
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %load, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
+define void @load_global_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(1) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -814,16 +814,16 @@ define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %r
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
-  %load = load i8, i8 addrspace(1)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
+  %load = load i8, ptr addrspace(1) %gep
   %ext = zext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
+define void @load_global_hi_v2i16_reglo_vreg_sexti8(ptr addrspace(1) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -866,16 +866,16 @@ define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %r
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
-  %load = load i8, i8 addrspace(1)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
+  %load = load i8, ptr addrspace(1) %gep
   %ext = sext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %reg) #0 {
+define void @load_global_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(1) %in, half %reg) #0 {
 ; GFX900-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -918,17 +918,17 @@ define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
-  %load = load i8, i8 addrspace(1)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
+  %load = load i8, ptr addrspace(1) %gep
   %ext = sext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %reg) #0 {
+define void @load_global_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(1) %in, half %reg) #0 {
 ; GFX900-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -971,17 +971,17 @@ define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
-  %load = load i8, i8 addrspace(1)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
+  %load = load i8, ptr addrspace(1) %gep
   %ext = zext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
+define void @load_flat_hi_v2i16_reglo_vreg(ptr %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_flat_hi_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1022,14 +1022,14 @@ define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16* %in
+  %load = load i16, ptr %in
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
+define void @load_flat_hi_v2f16_reglo_vreg(ptr %in, half %reg) #0 {
 ; GFX900-LABEL: load_flat_hi_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1070,14 +1070,14 @@ define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load half, half* %in
+  %load = load half, ptr %in
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %load, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
+define void @load_flat_hi_v2i16_reglo_vreg_zexti8(ptr %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1118,15 +1118,15 @@ define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i8, i8* %in
+  %load = load i8, ptr %in
   %ext = zext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
+define void @load_flat_hi_v2i16_reglo_vreg_sexti8(ptr %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1167,15 +1167,15 @@ define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i8, i8* %in
+  %load = load i8, ptr %in
   %ext = sext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 {
+define void @load_flat_hi_v2f16_reglo_vreg_zexti8(ptr %in, half %reg) #0 {
 ; GFX900-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1216,16 +1216,16 @@ define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 {
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i8, i8* %in
+  %load = load i8, ptr %in
   %ext = zext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 {
+define void @load_flat_hi_v2f16_reglo_vreg_sexti8(ptr %in, half %reg) #0 {
 ; GFX900-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1266,16 +1266,16 @@ define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 {
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i8, i8* %in
+  %load = load i8, ptr %in
   %ext = sext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
+define void @load_private_hi_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1316,15 +1316,15 @@ define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval(i16) %in,
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
-  %load = load i16, i16 addrspace(5)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i64 2047
+  %load = load i16, ptr addrspace(5) %gep
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval(half) %in, half %reg) #0 {
+define void @load_private_hi_v2f16_reglo_vreg(ptr addrspace(5) byval(half) %in, half %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1365,15 +1365,15 @@ define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval(half) %in
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
-  %load = load half, half addrspace(5)* %gep
+  %gep = getelementptr inbounds half, ptr addrspace(5) %in, i64 2047
+  %load = load half, ptr addrspace(5) %gep
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %load, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
+define void @load_private_hi_v2i16_reglo_vreg_nooff(ptr addrspace(5) byval(i16) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1415,14 +1415,14 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval(i16)
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
+  %load = load volatile i16, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
+define void @load_private_hi_v2f16_reglo_vreg_nooff(ptr addrspace(5) %in, half %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1464,14 +1464,14 @@ define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
+  %load = load volatile half, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %load, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, i16 %reg) #0 {
+define void @load_private_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1512,16 +1512,16 @@ define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8)
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
-  %load = load i8, i8 addrspace(5)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
+  %load = load i8, ptr addrspace(5) %gep
   %ext = zext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, half %reg) #0 {
+define void @load_private_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8) %in, half %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1562,17 +1562,17 @@ define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8)
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
-  %load = load i8, i8 addrspace(5)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
+  %load = load i8, ptr addrspace(5) %gep
   %ext = zext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, half %reg) #0 {
+define void @load_private_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(5) byval(i8) %in, half %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1613,17 +1613,17 @@ define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8)
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
-  %load = load i8, i8 addrspace(5)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
+  %load = load i8, ptr addrspace(5) %gep
   %ext = sext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, i16 %reg) #0 {
+define void @load_private_hi_v2i16_reglo_vreg_sexti8(ptr addrspace(5) byval(i8) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1664,16 +1664,16 @@ define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8)
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
-  %load = load i8, i8 addrspace(5)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
+  %load = load i8, ptr addrspace(5) %gep
   %ext = sext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
+define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1715,15 +1715,15 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
+  %load = load volatile i8, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
   %ext = zext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
+define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(ptr addrspace(5) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1765,15 +1765,15 @@ define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
+  %load = load volatile i8, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
   %ext = sext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
+define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in, half %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1815,16 +1815,16 @@ define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
+  %load = load volatile i8, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
   %ext = zext i8 %load to i16
   %bc.ext = bitcast i16 %ext to half
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
+define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_constant_hi_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1867,15 +1867,15 @@ define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg)
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
-  %load = load i16, i16 addrspace(4)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(4) %in, i64 -2047
+  %load = load i16, ptr addrspace(4) %gep
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
+define void @load_constant_hi_v2f16_reglo_vreg(ptr addrspace(4) %in, half %reg) #0 {
 ; GFX900-LABEL: load_constant_hi_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1918,15 +1918,15 @@ define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
-  %load = load half, half addrspace(4)* %gep
+  %gep = getelementptr inbounds half, ptr addrspace(4) %in, i64 -2047
+  %load = load half, ptr addrspace(4) %gep
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %load, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half %reg) #0 {
+define void @load_constant_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(4) %in, half %reg) #0 {
 ; GFX900-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1969,17 +1969,17 @@ define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
-  %load = load i8, i8 addrspace(4)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %in, i64 -4095
+  %load = load i8, ptr addrspace(4) %gep
   %ext = sext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half %reg) #0 {
+define void @load_constant_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(4) %in, half %reg) #0 {
 ; GFX900-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2022,20 +2022,20 @@ define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
-  %load = load i8, i8 addrspace(4)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %in, i64 -4095
+  %load = load i8, ptr addrspace(4) %gep
   %ext = zext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
 ; Local object gives known offset, so requires converting from offen
 ; to offset variant.
 
-define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 {
+define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(5) %obj0) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2089,17 +2089,16 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, [10 x i32] add
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj1 = alloca [4096 x i16], align 2, addrspace(5)
-  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
-  store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
-  %load = load i16, i16 addrspace(5)* %gep
+  store volatile i32 123, ptr addrspace(5) %obj0
+  %gep = getelementptr inbounds [4096 x i16], ptr addrspace(5) %obj1, i32 0, i32 2027
+  %load = load i16, ptr addrspace(5) %gep
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 {
+define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg, ptr addrspace(5) %obj0) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2153,18 +2152,17 @@ define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg, [10 x i
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
-  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
-  store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
-  %load = load i8, i8 addrspace(5)* %gep
+  store volatile i32 123, ptr addrspace(5) %obj0
+  %gep = getelementptr inbounds [4096 x i8], ptr addrspace(5) %obj1, i32 0, i32 4055
+  %load = load i8, ptr addrspace(5) %gep
   %ext = sext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 {
+define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, ptr addrspace(5) %obj0) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2218,20 +2216,19 @@ define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, [10 x i
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
-  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
-  store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
-  %load = load i8, i8 addrspace(5)* %gep
+  store volatile i32 123, ptr addrspace(5) %obj0
+  %gep = getelementptr inbounds [4096 x i8], ptr addrspace(5) %obj1, i32 0, i32 4055
+  %load = load i8, ptr addrspace(5) %gep
   %ext = zext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
 ; FIXME: Remove m0 init and waitcnt between reads
 ; FIXME: Is there a cost to using the extload over not?
-define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 {
+define <2 x i16> @load_local_v2i16_split_multi_chain(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_v2i16_split_multi_chain:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2273,15 +2270,15 @@ define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 {
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
-  %load0 = load volatile i16, i16 addrspace(3)* %in
-  %load1 = load volatile i16, i16 addrspace(3)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 1
+  %load0 = load volatile i16, ptr addrspace(3) %in
+  %load1 = load volatile i16, ptr addrspace(3) %gep
   %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
   ret <2 x i16> %build1
 }
 
-define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 {
+define <2 x i16> @load_local_lo_hi_v2i16_samechain(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_lo_hi_v2i16_samechain:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2324,16 +2321,16 @@ define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 {
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
-  %load.lo = load i16, i16 addrspace(3)* %in
-  %load.hi = load i16, i16 addrspace(3)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
+  %load.lo = load i16, ptr addrspace(3) %in
+  %load.hi = load i16, ptr addrspace(3) %gep
   %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
   ret <2 x i16> %build1
 }
 
 ; FIXME: Remove and
-define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
+define <2 x i16> @load_local_v2i16_broadcast(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_v2i16_broadcast:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2371,14 +2368,14 @@ define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v0, v0, s0
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
-  %load0 = load i16, i16 addrspace(3)* %in
+  %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 1
+  %load0 = load i16, ptr addrspace(3) %in
   %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
   ret <2 x i16> %build1
 }
 
-define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16 addrspace(3)* %may.alias) #0 {
+define <2 x i16> @load_local_lo_hi_v2i16_side_effect(ptr addrspace(3) %in, ptr addrspace(3) %may.alias) #0 {
 ; GFX900-LABEL: load_local_lo_hi_v2i16_side_effect:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2428,17 +2425,17 @@ define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
-  %load.lo = load i16, i16 addrspace(3)* %in
-  store i16 123, i16 addrspace(3)* %may.alias
-  %load.hi = load i16, i16 addrspace(3)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
+  %load.lo = load i16, ptr addrspace(3) %in
+  store i16 123, ptr addrspace(3) %may.alias
+  %load.hi = load i16, ptr addrspace(3) %gep
   %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
   ret <2 x i16> %build1
 }
 
 ; FIXME: Remove waitcnt between reads
-define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
+define <2 x i16> @load_global_v2i16_split(ptr addrspace(1) %in) #0 {
 ; GFX900-LABEL: load_global_v2i16_split:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2483,16 +2480,16 @@ define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
-  %load0 = load volatile i16, i16 addrspace(1)* %in
-  %load1 = load volatile i16, i16 addrspace(1)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 1
+  %load0 = load volatile i16, ptr addrspace(1) %in
+  %load1 = load volatile i16, ptr addrspace(1) %gep
   %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
   ret <2 x i16> %build1
 }
 
 ; FIXME: Remove waitcnt between reads
-define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
+define <2 x i16> @load_flat_v2i16_split(ptr %in) #0 {
 ; GFX900-LABEL: load_flat_v2i16_split:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2538,16 +2535,16 @@ define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16* %in, i64 1
-  %load0 = load volatile i16, i16* %in
-  %load1 = load volatile i16, i16* %gep
+  %gep = getelementptr inbounds i16, ptr %in, i64 1
+  %load0 = load volatile i16, ptr %in
+  %load1 = load volatile i16, ptr %gep
   %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
   ret <2 x i16> %build1
 }
 
 ; FIXME: Remove waitcnt between reads
-define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
+define <2 x i16> @load_constant_v2i16_split(ptr addrspace(4) %in) #0 {
 ; GFX900-LABEL: load_constant_v2i16_split:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2590,9 +2587,9 @@ define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
-  %load0 = load volatile i16, i16 addrspace(4)* %in
-  %load1 = load volatile i16, i16 addrspace(4)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(4) %in, i64 1
+  %load0 = load volatile i16, ptr addrspace(4) %in
+  %load1 = load volatile i16, ptr addrspace(4) %gep
   %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
   ret <2 x i16> %build1
@@ -2600,7 +2597,7 @@ entry:
 
 ; FIXME: Remove m0 init and waitcnt between reads
 ; FIXME: Is there a cost to using the extload over not?
-define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval(i16) %in) #0 {
+define <2 x i16> @load_private_v2i16_split(ptr addrspace(5) byval(i16) %in) #0 {
 ; GFX900-LABEL: load_private_v2i16_split:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2641,9 +2638,9 @@ define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval(i16) %in) #0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
-  %load0 = load volatile i16, i16 addrspace(5)* %in
-  %load1 = load volatile i16, i16 addrspace(5)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
+  %load0 = load volatile i16, ptr addrspace(5) %in
+  %load1 = load volatile i16, ptr addrspace(5) %gep
   %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
   ret <2 x i16> %build1
@@ -2652,7 +2649,7 @@ entry:
 ; FIXME: This test should work without copying of v0.
 ;        ds_read_u16_d16_hi preserves low 16 bits of the destination
 ;        and ds_write_b16 only reads low 16 bits.
-define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, i16 addrspace(3)* %in) #0 {
+define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_store_local_lo:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2700,10 +2697,10 @@ define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, i16 addrspace(3)*
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
-  store volatile i16 %reg, i16 addrspace(3)* %in
+  store volatile i16 %reg, ptr addrspace(3) %in
   ret <2 x i16> %build1
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/load-input-fold.ll b/llvm/test/CodeGen/AMDGPU/load-input-fold.ll
index 0724e09d7ad09..ada0ee35fdfea 100644
--- a/llvm/test/CodeGen/AMDGPU/load-input-fold.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-input-fold.ll
@@ -14,71 +14,71 @@ main_body:
   %9 = extractelement <4 x float> %reg3, i32 1
   %10 = extractelement <4 x float> %reg3, i32 2
   %11 = extractelement <4 x float> %reg3, i32 3
-  %12 = load <4 x float>, <4 x float> addrspace(8)* null
+  %12 = load <4 x float>, ptr addrspace(8) null
   %13 = extractelement <4 x float> %12, i32 0
   %14 = fmul float %0, %13
-  %15 = load <4 x float>, <4 x float> addrspace(8)* null
+  %15 = load <4 x float>, ptr addrspace(8) null
   %16 = extractelement <4 x float> %15, i32 1
   %17 = fmul float %0, %16
-  %18 = load <4 x float>, <4 x float> addrspace(8)* null
+  %18 = load <4 x float>, ptr addrspace(8) null
   %19 = extractelement <4 x float> %18, i32 2
   %20 = fmul float %0, %19
-  %21 = load <4 x float>, <4 x float> addrspace(8)* null
+  %21 = load <4 x float>, ptr addrspace(8) null
   %22 = extractelement <4 x float> %21, i32 3
   %23 = fmul float %0, %22
-  %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %24 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %25 = extractelement <4 x float> %24, i32 0
   %26 = fmul float %1, %25
   %27 = fadd float %26, %14
-  %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %28 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %29 = extractelement <4 x float> %28, i32 1
   %30 = fmul float %1, %29
   %31 = fadd float %30, %17
-  %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %32 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %33 = extractelement <4 x float> %32, i32 2
   %34 = fmul float %1, %33
   %35 = fadd float %34, %20
-  %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %36 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %37 = extractelement <4 x float> %36, i32 3
   %38 = fmul float %1, %37
   %39 = fadd float %38, %23
-  %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %40 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %41 = extractelement <4 x float> %40, i32 0
   %42 = fmul float %2, %41
   %43 = fadd float %42, %27
-  %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %44 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %45 = extractelement <4 x float> %44, i32 1
   %46 = fmul float %2, %45
   %47 = fadd float %46, %31
-  %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %48 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %49 = extractelement <4 x float> %48, i32 2
   %50 = fmul float %2, %49
   %51 = fadd float %50, %35
-  %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %52 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %53 = extractelement <4 x float> %52, i32 3
   %54 = fmul float %2, %53
   %55 = fadd float %54, %39
-  %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %56 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 3)
   %57 = extractelement <4 x float> %56, i32 0
   %58 = fmul float %3, %57
   %59 = fadd float %58, %43
-  %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %60 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 3)
   %61 = extractelement <4 x float> %60, i32 1
   %62 = fmul float %3, %61
   %63 = fadd float %62, %47
-  %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %64 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 3)
   %65 = extractelement <4 x float> %64, i32 2
   %66 = fmul float %3, %65
   %67 = fadd float %66, %51
-  %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %68 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 3)
   %69 = extractelement <4 x float> %68, i32 3
   %70 = fmul float %3, %69
   %71 = fadd float %70, %55
-  %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %72 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 4)
   %73 = extractelement <4 x float> %72, i32 0
-  %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %74 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 4)
   %75 = extractelement <4 x float> %74, i32 1
-  %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %76 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 4)
   %77 = extractelement <4 x float> %76, i32 2
   %78 = insertelement <4 x float> undef, float %4, i32 0
   %79 = insertelement <4 x float> %78, float %5, i32 1

diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 9506eb1b08832..8f01d1db64cc4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck --check-prefix=GFX803 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs --mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX900,GFX900-FLATSCR %s
 
-define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
+define <2 x i16> @load_local_lo_v2i16_undeflo(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_lo_v2i16_undeflo:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27,12 +27,12 @@ define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %build = insertelement <2 x i16> undef, i16 %load, i32 0
   ret <2 x i16> %build
 }
 
-define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
+define <2 x i16> @load_local_lo_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -70,14 +70,14 @@ define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
   ret <2 x i16> %build1
 }
 
 ; Show that we get reasonable regalloc without physreg constraints.
-define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
+define void @load_local_lo_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo_vreg:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -123,14 +123,14 @@ define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
+define <2 x i16> @load_local_lo_v2i16_zerolo(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_lo_v2i16_zerolo:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -156,12 +156,12 @@ define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
   ret <2 x i16> %build
 }
 
-define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 {
+define <2 x half> @load_local_lo_v2f16_fpimm(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_lo_v2f16_fpimm:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -190,12 +190,12 @@ define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 {
 ; GFX803-NEXT:    v_or_b32_e32 v0, 2.0, v0
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load half, half addrspace(3)* %in
+  %load = load half, ptr addrspace(3) %in
   %build = insertelement <2 x half> <half 0.0, half 2.0>, half %load, i32 0
   ret <2 x half> %build
 }
 
-define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0 {
+define void @load_local_lo_v2f16_reghi_vreg(ptr addrspace(3) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_local_lo_v2f16_reghi_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -229,13 +229,13 @@ define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %load = load half, half addrspace(3)* %in
+  %load = load half, ptr addrspace(3) %in
   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
+define void @load_local_lo_v2f16_reglo_vreg(ptr addrspace(3) %in, half %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2f16_reglo_vreg:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -281,14 +281,14 @@ define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load half, half addrspace(3)* %in
+  %load = load half, ptr addrspace(3) %in
   %build0 = insertelement <2 x half> undef, half %reg, i32 1
   %build1 = insertelement <2 x half> %build0, half %load, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
+define void @load_local_lo_v2i16_reghi_vreg_zexti8(ptr addrspace(3) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -322,14 +322,14 @@ define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %re
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %load = load i8, i8 addrspace(3)* %in
+  %load = load i8, ptr addrspace(3) %in
   %ext = zext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
+define void @load_local_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -375,15 +375,15 @@ define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %re
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i8, i8 addrspace(3)* %in
+  %load = load i8, ptr addrspace(3) %in
   %ext = zext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
+define void @load_local_lo_v2i16_reghi_vreg_sexti8(ptr addrspace(3) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -417,14 +417,14 @@ define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %re
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %load = load i8, i8 addrspace(3)* %in
+  %load = load i8, ptr addrspace(3) %in
   %ext = sext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
+define void @load_local_lo_v2i16_reglo_vreg_sexti8(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -470,15 +470,15 @@ define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %re
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i8, i8 addrspace(3)* %in
+  %load = load i8, ptr addrspace(3) %in
   %ext = sext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
   %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
+define void @load_local_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(3) %in, half %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -524,16 +524,16 @@ define void @load_local_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %r
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i8, i8 addrspace(3)* %in
+  %load = load i8, ptr addrspace(3) %in
   %ext = zext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build0 = insertelement <2 x half> undef, half %reg, i32 1
   %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
+define void @load_local_lo_v2f16_reglo_vreg_sexti8(ptr addrspace(3) %in, half %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -579,16 +579,16 @@ define void @load_local_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %r
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i8, i8 addrspace(3)* %in
+  %load = load i8, ptr addrspace(3) %in
   %ext = sext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build0 = insertelement <2 x half> undef, half %reg, i32 1
   %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
+define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, <2 x i16> %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -642,15 +642,15 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in,
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %elt1 = extractelement <2 x i16> %reg, i32 1
-  store i16 %load, i16 addrspace(3)* null
+  store i16 %load, ptr addrspace(3) null
   %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
+define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(ptr addrspace(3) %in, <2 x i16> %reg) #0 {
 ; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -692,15 +692,15 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in,
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %elt1 = extractelement <2 x i16> %reg, i32 1
-  store i16 %elt1, i16 addrspace(3)* null
+  store i16 %elt1, ptr addrspace(3) null
   %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noalias %in, <2 x i16> %reg, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
+define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(ptr addrspace(3) noalias %in, <2 x i16> %reg, ptr addrspace(3) noalias %out0, ptr addrspace(3) noalias %out1) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -758,16 +758,16 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load i16, i16 addrspace(3)* %in
+  %load = load i16, ptr addrspace(3) %in
   %elt1 = extractelement <2 x i16> %reg, i32 1
-  store i16 %load, i16 addrspace(3)* %out0
-  store i16 %elt1, i16 addrspace(3)* %out1
+  store i16 %load, ptr addrspace(3) %out0
+  store i16 %elt1, ptr addrspace(3) %out1
   %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0 {
+define void @load_global_lo_v2i16_reglo_vreg(ptr addrspace(1) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -802,14 +802,14 @@ define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
-  %load = load i16, i16 addrspace(1)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 -2047
+  %load = load i16, ptr addrspace(1) %gep
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) #0 {
+define void @load_global_lo_v2f16_reglo_vreg(ptr addrspace(1) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -844,14 +844,14 @@ define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) #
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
-  %load = load half, half addrspace(1)* %gep
+  %gep = getelementptr inbounds half, ptr addrspace(1) %in, i64 -2047
+  %load = load half, ptr addrspace(1) %gep
   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
+define void @load_global_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(1) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -886,15 +886,15 @@ define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %r
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
-  %load = load i8, i8 addrspace(1)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
+  %load = load i8, ptr addrspace(1) %gep
   %ext = zext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
+define void @load_global_lo_v2i16_reglo_vreg_sexti8(ptr addrspace(1) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -929,15 +929,15 @@ define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %r
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
-  %load = load i8, i8 addrspace(1)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
+  %load = load i8, ptr addrspace(1) %gep
   %ext = sext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_global_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
+define void @load_global_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(1) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -972,16 +972,16 @@ define void @load_global_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %r
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
-  %load = load i8, i8 addrspace(1)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
+  %load = load i8, ptr addrspace(1) %gep
   %ext = zext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_global_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
+define void @load_global_lo_v2f16_reglo_vreg_sexti8(ptr addrspace(1) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1016,16 +1016,16 @@ define void @load_global_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %r
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
-  %load = load i8, i8 addrspace(1)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
+  %load = load i8, ptr addrspace(1) %gep
   %ext = sext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 {
+define void @load_flat_lo_v2i16_reghi_vreg(ptr %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_flat_lo_v2i16_reghi_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1058,13 +1058,13 @@ define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 {
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %load = load i16, i16* %in
+  %load = load i16, ptr %in
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 {
+define void @load_flat_lo_v2f16_reghi_vreg(ptr %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_flat_lo_v2f16_reghi_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1099,13 +1099,13 @@ define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 {
 ; FIXME: the and above should be removable
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %load = load half, half* %in
+  %load = load half, ptr %in
   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
+define void @load_flat_lo_v2i16_reglo_vreg_zexti8(ptr %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1138,14 +1138,14 @@ define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %load = load i8, i8* %in
+  %load = load i8, ptr %in
   %ext = zext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
+define void @load_flat_lo_v2i16_reglo_vreg_sexti8(ptr %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1178,14 +1178,14 @@ define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %load = load i8, i8* %in
+  %load = load i8, ptr %in
   %ext = sext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
+define void @load_flat_lo_v2f16_reglo_vreg_zexti8(ptr %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1218,15 +1218,15 @@ define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %load = load i8, i8* %in
+  %load = load i8, ptr %in
   %ext = zext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_flat_lo_v2f16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
+define void @load_flat_lo_v2f16_reglo_vreg_sexti8(ptr %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1259,15 +1259,15 @@ define void @load_flat_lo_v2f16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %load = load i8, i8* %in
+  %load = load i8, ptr %in
   %ext = sext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval(i16) %in, i32 %reg) #0 {
+define void @load_private_lo_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1309,14 +1309,14 @@ define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval(i16) %in,
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
-  %load = load i16, i16 addrspace(5)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i64 2047
+  %load = load i16, ptr addrspace(5) %gep
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
+define void @load_private_lo_v2i16_reghi_vreg(ptr addrspace(5) byval(i16) %in, i16 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1361,15 +1361,15 @@ define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval(i16) %in,
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
-  %load = load i16, i16 addrspace(5)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i64 2047
+  %load = load i16, ptr addrspace(5) %gep
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval(half) %in, i32 %reg) #0 {
+define void @load_private_lo_v2f16_reglo_vreg(ptr addrspace(5) byval(half) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1411,14 +1411,14 @@ define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval(half) %in
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
-  %load = load half, half addrspace(5)* %gep
+  %gep = getelementptr inbounds half, ptr addrspace(5) %in, i64 2047
+  %load = load half, ptr addrspace(5) %gep
   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
+define void @load_private_lo_v2i16_reglo_vreg_nooff(ptr addrspace(5) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1461,13 +1461,13 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
+  %load = load volatile i16, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
+define void @load_private_lo_v2i16_reghi_vreg_nooff(ptr addrspace(5) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1510,13 +1510,13 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
+  %load = load volatile i16, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 {
+define void @load_private_lo_v2f16_reglo_vreg_nooff(ptr addrspace(5) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1559,13 +1559,13 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
+  %load = load volatile half, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, i32 %reg) #0 {
+define void @load_private_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1607,15 +1607,15 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
-  %load = load i8, i8 addrspace(5)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
+  %load = load i8, ptr addrspace(5) %gep
   %ext = zext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, i32 %reg) #0 {
+define void @load_private_lo_v2i16_reglo_vreg_sexti8(ptr addrspace(5) byval(i8) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1657,15 +1657,15 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
-  %load = load i8, i8 addrspace(5)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
+  %load = load i8, ptr addrspace(5) %gep
   %ext = sext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
+define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1708,14 +1708,14 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
+  %load = load volatile i8, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
   %ext = zext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
+define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(ptr addrspace(5) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1758,14 +1758,14 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
+  %load = load volatile i8, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
   %ext = sext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
+define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1808,15 +1808,15 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
+  %load = load volatile i8, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
   %ext = zext i8 %load to i16
   %bc.ext = bitcast i16 %ext to half
   %build1 = insertelement <2 x half> %reg.bc, half %bc.ext, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg) #0 {
+define void @load_constant_lo_v2i16_reglo_vreg(ptr addrspace(4) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_constant_lo_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1851,14 +1851,14 @@ define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
-  %load = load i16, i16 addrspace(4)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(4) %in, i64 -2047
+  %load = load i16, ptr addrspace(4) %gep
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg) #0 {
+define void @load_constant_lo_v2f16_reglo_vreg(ptr addrspace(4) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1893,14 +1893,14 @@ define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
-  %load = load half, half addrspace(4)* %gep
+  %gep = getelementptr inbounds half, ptr addrspace(4) %in, i64 -2047
+  %load = load half, ptr addrspace(4) %gep
   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_constant_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
+define void @load_constant_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(4) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1935,16 +1935,16 @@ define void @load_constant_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
-  %load = load i8, i8 addrspace(4)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %in, i64 -4095
+  %load = load i8, ptr addrspace(4) %gep
   %ext = zext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
-define void @load_constant_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
+define void @load_constant_lo_v2f16_reglo_vreg_sexti8(ptr addrspace(4) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1979,12 +1979,12 @@ define void @load_constant_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
-  %load = load i8, i8 addrspace(4)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(4) %in, i64 -4095
+  %load = load i8, ptr addrspace(4) %gep
   %ext = sext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
   %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
@@ -2048,12 +2048,11 @@ entry:
   %obj0 = alloca [10 x i32], align 4, addrspace(5)
   %obj1 = alloca [4096 x i16], align 2, addrspace(5)
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
-  store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
-  %load = load volatile i16, i16 addrspace(5)* %gep
+  store volatile i32 123, ptr addrspace(5) %obj0
+  %gep = getelementptr inbounds [4096 x i16], ptr addrspace(5) %obj1, i32 0, i32 2027
+  %load = load volatile i16, ptr addrspace(5) %gep
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
@@ -2117,13 +2116,12 @@ entry:
   %obj0 = alloca [10 x i32], align 4, addrspace(5)
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
-  store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
-  %load = load volatile i8, i8 addrspace(5)* %gep
+  store volatile i32 123, ptr addrspace(5) %obj0
+  %gep = getelementptr inbounds [4096 x i8], ptr addrspace(5) %obj1, i32 0, i32 4055
+  %load = load volatile i8, ptr addrspace(5) %gep
   %load.ext = sext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
@@ -2187,13 +2185,12 @@ entry:
   %obj0 = alloca [10 x i32], align 4, addrspace(5)
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
-  store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
-  %load = load volatile i8, i8 addrspace(5)* %gep
+  store volatile i32 123, ptr addrspace(5) %obj0
+  %gep = getelementptr inbounds [4096 x i8], ptr addrspace(5) %obj1, i32 0, i32 4055
+  %load = load volatile i8, ptr addrspace(5) %gep
   %load.ext = zext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }
 
@@ -2257,14 +2254,13 @@ entry:
   %obj0 = alloca [10 x i32], align 4, addrspace(5)
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
-  store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
-  %load = load volatile i8, i8 addrspace(5)* %gep
+  store volatile i32 123, ptr addrspace(5) %obj0
+  %gep = getelementptr inbounds [4096 x i8], ptr addrspace(5) %obj1, i32 0, i32 4055
+  %load = load volatile i8, ptr addrspace(5) %gep
   %load.ext = sext i8 %load to i16
   %bitcast = bitcast i16 %load.ext to half
   %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 
@@ -2328,14 +2324,13 @@ entry:
   %obj0 = alloca [10 x i32], align 4, addrspace(5)
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
-  store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
-  %load = load volatile i8, i8 addrspace(5)* %gep
+  store volatile i32 123, ptr addrspace(5) %obj0
+  %gep = getelementptr inbounds [4096 x i8], ptr addrspace(5) %obj1, i32 0, i32 4055
+  %load = load volatile i8, ptr addrspace(5) %gep
   %load.ext = zext i8 %load to i16
   %bitcast = bitcast i16 %load.ext to half
   %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
-  store <2 x half> %build1, <2 x half> addrspace(1)* undef
+  store <2 x half> %build1, ptr addrspace(1) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/load-local-f32-no-ds128.ll b/llvm/test/CodeGen/AMDGPU/load-local-f32-no-ds128.ll
index a7c4341759009..84ca6000cfe77 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-f32-no-ds128.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-f32-no-ds128.ll
@@ -11,9 +11,9 @@
 ;
 ; CIVI: ds_read2_b64
 ; CIVI: ds_write2_b64
-define amdgpu_kernel void @local_v4f32_to_2b64(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) {
-  %ld = load <4 x float>, <4 x float> addrspace(3)* %in, align 16
-  store <4 x float> %ld, <4 x float> addrspace(3)* %out, align 16
+define amdgpu_kernel void @local_v4f32_to_2b64(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+  %ld = load <4 x float>, ptr addrspace(3) %in, align 16
+  store <4 x float> %ld, ptr addrspace(3) %out, align 16
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/load-local-f32.ll b/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
index e78406c92e7be..ed422a647c51a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
@@ -13,10 +13,10 @@
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_f32_local(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %tmp0 = load float, float addrspace(3)* %in
-  store float %tmp0, float addrspace(1)* %out
+  %tmp0 = load float, ptr addrspace(3) %in
+  store float %tmp0, ptr addrspace(1) %out
   ret void
 }
 
@@ -28,10 +28,10 @@ entry:
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_v2f32_local(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %tmp0 = load <2 x float>, <2 x float> addrspace(3)* %in
-  store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
+  %tmp0 = load <2 x float>, ptr addrspace(3) %in
+  store <2 x float> %tmp0, ptr addrspace(1) %out
   ret void
 }
 
@@ -51,10 +51,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3f32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %tmp0 = load <3 x float>, <3 x float> addrspace(3)* %in
-  store <3 x float> %tmp0, <3 x float> addrspace(3)* %out
+  %tmp0 = load <3 x float>, ptr addrspace(3) %in
+  store <3 x float> %tmp0, ptr addrspace(3) %out
   ret void
 }
 
@@ -68,10 +68,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4f32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %tmp0 = load <4 x float>, <4 x float> addrspace(3)* %in
-  store <4 x float> %tmp0, <4 x float> addrspace(3)* %out
+  %tmp0 = load <4 x float>, ptr addrspace(3) %in
+  store <4 x float> %tmp0, ptr addrspace(3) %out
   ret void
 }
 
@@ -90,10 +90,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8f32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %tmp0 = load <8 x float>, <8 x float> addrspace(3)* %in
-  store <8 x float> %tmp0, <8 x float> addrspace(3)* %out
+  %tmp0 = load <8 x float>, ptr addrspace(3) %in
+  store <8 x float> %tmp0, ptr addrspace(3) %out
   ret void
 }
 
@@ -122,10 +122,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16f32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %tmp0 = load <16 x float>, <16 x float> addrspace(3)* %in
-  store <16 x float> %tmp0, <16 x float> addrspace(3)* %out
+  %tmp0 = load <16 x float>, ptr addrspace(3) %in
+  store <16 x float> %tmp0, ptr addrspace(3) %out
   ret void
 }
 
@@ -142,9 +142,9 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_v4f32_to_128(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) {
-  %ld = load <4 x float>, <4 x float> addrspace(3)* %in, align 16
-  store <4 x float> %ld, <4 x float> addrspace(3)* %out, align 16
+define amdgpu_kernel void @local_v4f32_to_128(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+  %ld = load <4 x float>, ptr addrspace(3) %in, align 16
+  store <4 x float> %ld, ptr addrspace(3) %out, align 16
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/load-local-f64.ll b/llvm/test/CodeGen/AMDGPU/load-local-f64.ll
index 8071c406bb5f9..136289ea74e33 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-f64.ll
@@ -17,9 +17,9 @@
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 {
-  %ld = load double, double addrspace(3)* %in
-  store double %ld, double addrspace(3)* %out
+define amdgpu_kernel void @local_load_f64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load double, ptr addrspace(3) %in
+  store double %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -33,10 +33,10 @@ define amdgpu_kernel void @local_load_f64(double addrspace(3)* %out, double addr
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2f64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <2 x double>, <2 x double> addrspace(3)* %in
-  store <2 x double> %ld, <2 x double> addrspace(3)* %out
+  %ld = load <2 x double>, ptr addrspace(3) %in
+  store <2 x double> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -53,10 +53,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3f64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <3 x double>, <3 x double> addrspace(3)* %in
-  store <3 x double> %ld, <3 x double> addrspace(3)* %out
+  %ld = load <3 x double>, ptr addrspace(3) %in
+  store <3 x double> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -76,10 +76,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4f64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <4 x double>, <4 x double> addrspace(3)* %in
-  store <4 x double> %ld, <4 x double> addrspace(3)* %out
+  %ld = load <4 x double>, ptr addrspace(3) %in
+  store <4 x double> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -108,10 +108,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8f64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <8 x double>, <8 x double> addrspace(3)* %in
-  store <8 x double> %ld, <8 x double> addrspace(3)* %out
+  %ld = load <8 x double>, ptr addrspace(3) %in
+  store <8 x double> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -167,10 +167,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16f64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <16 x double>, <16 x double> addrspace(3)* %in
-  store <16 x double> %ld, <16 x double> addrspace(3)* %out
+  %ld = load <16 x double>, ptr addrspace(3) %in
+  store <16 x double> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -184,10 +184,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v2f64_to_128(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v2f64_to_128(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %ld = load <2 x double>, <2 x double> addrspace(3)* %in, align 16
-  store <2 x double> %ld, <2 x double> addrspace(3)* %out, align 16
+  %ld = load <2 x double>, ptr addrspace(3) %in, align 16
+  store <2 x double> %ld, ptr addrspace(3) %out, align 16
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i1.ll b/llvm/test/CodeGen/AMDGPU/load-local-i1.ll
index e8e7fc3752e09..31cf71a44b5cc 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i1.ll
@@ -14,72 +14,72 @@
 ; EG: LDS_UBYTE_READ_RET
 ; EG: AND_INT
 ; EG: LDS_BYTE_WRITE
-define amdgpu_kernel void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
-  %load = load i1, i1 addrspace(3)* %in
-  store i1 %load, i1 addrspace(3)* %out
+define amdgpu_kernel void @local_load_i1(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load i1, ptr addrspace(3) %in
+  store i1 %load, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v2i1:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
-  store <2 x i1> %load, <2 x i1> addrspace(3)* %out
+define amdgpu_kernel void @local_load_v2i1(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(3) %in
+  store <2 x i1> %load, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v3i1:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
-  store <3 x i1> %load, <3 x i1> addrspace(3)* %out
+define amdgpu_kernel void @local_load_v3i1(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(3) %in
+  store <3 x i1> %load, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v4i1:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
-  store <4 x i1> %load, <4 x i1> addrspace(3)* %out
+define amdgpu_kernel void @local_load_v4i1(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(3) %in
+  store <4 x i1> %load, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v8i1:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
-  store <8 x i1> %load, <8 x i1> addrspace(3)* %out
+define amdgpu_kernel void @local_load_v8i1(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(3) %in
+  store <8 x i1> %load, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v16i1:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
-  store <16 x i1> %load, <16 x i1> addrspace(3)* %out
+define amdgpu_kernel void @local_load_v16i1(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(3) %in
+  store <16 x i1> %load, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v32i1:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
-  store <32 x i1> %load, <32 x i1> addrspace(3)* %out
+define amdgpu_kernel void @local_load_v32i1(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(3) %in
+  store <32 x i1> %load, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v64i1:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
-  store <64 x i1> %load, <64 x i1> addrspace(3)* %out
+define amdgpu_kernel void @local_load_v64i1(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(3) %in
+  store <64 x i1> %load, ptr addrspace(3) %out
   ret void
 }
 
@@ -89,10 +89,10 @@ define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x
 
 ; GCN: ds_read_u8
 ; GCN: ds_write_b32
-define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
-  %a = load i1, i1 addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_i1_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i1, ptr addrspace(3) %in
   %ext = zext i1 %a to i32
-  store i32 %ext, i32 addrspace(3)* %out
+  store i32 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -106,170 +106,170 @@ define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 a
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
-define amdgpu_kernel void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
-  %a = load i1, i1 addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_i1_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i1, ptr addrspace(3) %in
   %ext = sext i1 %a to i32
-  store i32 %ext, i32 addrspace(3)* %out
+  store i32 %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i1>, ptr addrspace(3) %in
   %ext = zext <1 x i1> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+  store <1 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i1>, ptr addrspace(3) %in
   %ext = sext <1 x i1> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+  store <1 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(3) %in
   %ext = zext <2 x i1> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+  store <2 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(3) %in
   %ext = sext <2 x i1> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+  store <2 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v3i1_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(3) %in
   %ext = zext <3 x i1> %load to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+  store <3 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(3) %in
   %ext = sext <3 x i1> %load to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+  store <3 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(3) %in
   %ext = zext <4 x i1> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+  store <4 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(3) %in
   %ext = sext <4 x i1> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+  store <4 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(3) %in
   %ext = zext <8 x i1> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+  store <8 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(3) %in
   %ext = sext <8 x i1> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+  store <8 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(3) %in
   %ext = zext <16 x i1> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+  store <16 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(3) %in
   %ext = sext <16 x i1> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+  store <16 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(3) %in
   %ext = zext <32 x i1> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+  store <32 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(3) %in
   %ext = sext <32 x i1> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+  store <32 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(3) %in
   %ext = zext <64 x i1> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+  store <64 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(3) %in
   %ext = sext <64 x i1> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+  store <64 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -280,10 +280,10 @@ define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3
 ; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]],
 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
 ; GCN: ds_write_b64
-define amdgpu_kernel void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
-  %a = load i1, i1 addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_i1_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i1, ptr addrspace(3) %in
   %ext = zext i1 %a to i64
-  store i64 %ext, i64 addrspace(3)* %out
+  store i64 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -295,170 +295,170 @@ define amdgpu_kernel void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 a
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
 ; GCN: ds_write_b64
-define amdgpu_kernel void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
-  %a = load i1, i1 addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_i1_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i1, ptr addrspace(3) %in
   %ext = sext i1 %a to i64
-  store i64 %ext, i64 addrspace(3)* %out
+  store i64 %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i1>, ptr addrspace(3) %in
   %ext = zext <1 x i1> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+  store <1 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
-  %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i1>, ptr addrspace(3) %in
   %ext = sext <1 x i1> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+  store <1 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(3) %in
   %ext = zext <2 x i1> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+  store <2 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
-  %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i1>, ptr addrspace(3) %in
   %ext = sext <2 x i1> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+  store <2 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(3) %in
   %ext = zext <3 x i1> %load to <3 x i64>
-  store <3 x i64> %ext, <3 x i64> addrspace(3)* %out
+  store <3 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
-  %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v3i1_to_v3i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <3 x i1>, ptr addrspace(3) %in
   %ext = sext <3 x i1> %load to <3 x i64>
-  store <3 x i64> %ext, <3 x i64> addrspace(3)* %out
+  store <3 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v4i1_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(3) %in
   %ext = zext <4 x i1> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+  store <4 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
-  %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i1>, ptr addrspace(3) %in
   %ext = sext <4 x i1> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+  store <4 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(3) %in
   %ext = zext <8 x i1> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+  store <8 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
-  %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i1>, ptr addrspace(3) %in
   %ext = sext <8 x i1> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+  store <8 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(3) %in
   %ext = zext <16 x i1> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+  store <16 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
-  %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i1>, ptr addrspace(3) %in
   %ext = sext <16 x i1> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+  store <16 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(3) %in
   %ext = zext <32 x i1> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+  store <32 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
-  %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i1>, ptr addrspace(3) %in
   %ext = sext <32 x i1> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+  store <32 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(3) %in
   %ext = zext <64 x i1> %load to <64 x i64>
-  store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+  store <64 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
-define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
-  %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <64 x i1>, ptr addrspace(3) %in
   %ext = sext <64 x i1> %load to <64 x i64>
-  store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+  store <64 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index 8d2407214752a..43e9aa7a2d98f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -18,10 +18,10 @@
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]]
-define amdgpu_kernel void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %ld = load i16, i16 addrspace(3)* %in
-  store i16 %ld, i16 addrspace(3)* %out
+  %ld = load i16, ptr addrspace(3) %in
+  store i16 %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -36,10 +36,10 @@ entry:
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define amdgpu_kernel void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v2i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %ld = load <2 x i16>, <2 x i16> addrspace(3)* %in
-  store <2 x i16> %ld, <2 x i16> addrspace(3)* %out
+  %ld = load <2 x i16>, ptr addrspace(3) %in
+  store <2 x i16> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -53,10 +53,10 @@ entry:
 
 ; EG-DAG: LDS_USHORT_READ_RET
 ; EG-DAG: LDS_USHORT_READ_RET
-define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v3i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
-  store <3 x i16> %ld, <3 x i16> addrspace(3)* %out
+  %ld = load <3 x i16>, ptr addrspace(3) %in
+  store <3 x i16> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -68,10 +68,10 @@ entry:
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v4i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %ld = load <4 x i16>, <4 x i16> addrspace(3)* %in
-  store <4 x i16> %ld, <4 x i16> addrspace(3)* %out
+  %ld = load <4 x i16>, ptr addrspace(3) %in
+  store <4 x i16> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -85,10 +85,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v8i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in
-  store <8 x i16> %ld, <8 x i16> addrspace(3)* %out
+  %ld = load <8 x i16>, ptr addrspace(3) %in
+  store <8 x i16> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -109,10 +109,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v16i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %ld = load <16 x i16>, <16 x i16> addrspace(3)* %in
-  store <16 x i16> %ld, <16 x i16> addrspace(3)* %out
+  %ld = load <16 x i16>, ptr addrspace(3) %in
+  store <16 x i16> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -128,10 +128,10 @@ entry:
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define amdgpu_kernel void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
-  %a = load i16, i16 addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i16, ptr addrspace(3) %in
   %ext = zext i16 %a to i32
-  store i32 %ext, i32 addrspace(3)* %out
+  store i32 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -150,10 +150,10 @@ define amdgpu_kernel void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16
 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
 ; EG: 16
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define amdgpu_kernel void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
-  %a = load i16, i16 addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i16, ptr addrspace(3) %in
   %ext = sext i16 %a to i32
-  store i32 %ext, i32 addrspace(3)* %out
+  store i32 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -168,10 +168,10 @@ define amdgpu_kernel void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
-  %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i16>, ptr addrspace(3) %in
   %ext = zext <1 x i16> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+  store <1 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -188,10 +188,10 @@ define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)*
 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
 ; EG: 16
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
-  %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i16>, ptr addrspace(3) %in
   %ext = sext <1 x i16> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+  store <1 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -203,10 +203,10 @@ define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)*
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
-  %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i16>, ptr addrspace(3) %in
   %ext = zext <2 x i16> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+  store <2 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -220,10 +220,10 @@ define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)*
 ; EG: LDS_READ_RET
 ; EG: BFE_INT
 ; EG: BFE_INT
-define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
-  %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i16>, ptr addrspace(3) %in
   %ext = sext <2 x i16> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+  store <2 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -240,11 +240,11 @@ define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)*
 ; EG: LDS_USHORT_READ_RET
 ; EG: LDS_USHORT_READ_RET
 ; EG: LDS_USHORT_READ_RET
-define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
+  %ld = load <3 x i16>, ptr addrspace(3) %in
   %ext = zext <3 x i16> %ld to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+  store <3 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -264,11 +264,11 @@ entry:
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
+  %ld = load <3 x i16>, ptr addrspace(3) %in
   %ext = sext <3 x i16> %ld to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+  store <3 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -281,10 +281,10 @@ entry:
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
-  %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i16>, ptr addrspace(3) %in
   %ext = zext <4 x i16> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+  store <4 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -301,10 +301,10 @@ define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspa
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
-  %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i16>, ptr addrspace(3) %in
   %ext = sext <4 x i16> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+  store <4 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -318,10 +318,10 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)*
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
-  %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i16>, ptr addrspace(3) %in
   %ext = zext <8 x i16> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+  store <8 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -343,10 +343,10 @@ define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)*
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
-  %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i16>, ptr addrspace(3) %in
   %ext = sext <8 x i16> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+  store <8 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -370,10 +370,10 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)*
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
-  %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i16>, ptr addrspace(3) %in
   %ext = zext <16 x i16> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+  store <16 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -409,10 +409,10 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
-  %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i16>, ptr addrspace(3) %in
   %ext = sext <16 x i16> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+  store <16 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -441,10 +441,10 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
-  %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i16>, ptr addrspace(3) %in
   %ext = zext <32 x i16> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+  store <32 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -481,10 +481,10 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
-  %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i16>, ptr addrspace(3) %in
   %ext = sext <32 x i16> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+  store <32 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -549,10 +549,10 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
-  %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <64 x i16>, ptr addrspace(3) %in
   %ext = zext <64 x i16> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+  store <64 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -592,10 +592,10 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
-  %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <64 x i16>, ptr addrspace(3) %in
   %ext = sext <64 x i16> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+  store <64 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -613,10 +613,10 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG-DAG: LDS_WRITE
-define amdgpu_kernel void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
-  %a = load i16, i16 addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i16, ptr addrspace(3) %in
   %ext = zext i16 %a to i64
-  store i64 %ext, i64 addrspace(3)* %out
+  store i64 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -643,10 +643,10 @@ define amdgpu_kernel void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16
 ; EG-DAG: LDS_WRITE
 ; EG-DAG: 16
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define amdgpu_kernel void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
-  %a = load i16, i16 addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i16, ptr addrspace(3) %in
   %ext = sext i16 %a to i64
-  store i64 %ext, i64 addrspace(3)* %out
+  store i64 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -660,10 +660,10 @@ define amdgpu_kernel void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG-DAG: LDS_WRITE
-define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
-  %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i16>, ptr addrspace(3) %in
   %ext = zext <1 x i16> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+  store <1 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -680,10 +680,10 @@ define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)*
 ; EG-DAG: LDS_WRITE
 ; EG-DAG: 16
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
-  %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i16>, ptr addrspace(3) %in
   %ext = sext <1 x i16> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+  store <1 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -693,10 +693,10 @@ define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)*
 
 
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
-  %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i16>, ptr addrspace(3) %in
   %ext = zext <2 x i16> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+  store <2 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -708,10 +708,10 @@ define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)*
 ; EG: LDS_READ_RET
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
-define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
-  %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i16>, ptr addrspace(3) %in
   %ext = sext <2 x i16> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+  store <2 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -722,10 +722,10 @@ define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)*
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
-  %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i16>, ptr addrspace(3) %in
   %ext = zext <4 x i16> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+  store <4 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -740,10 +740,10 @@ define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)*
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
 ; EG-DAG: ASHR
-define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
-  %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i16>, ptr addrspace(3) %in
   %ext = sext <4 x i16> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+  store <4 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -756,10 +756,10 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)*
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
-  %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i16>, ptr addrspace(3) %in
   %ext = zext <8 x i16> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+  store <8 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -780,10 +780,10 @@ define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)*
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
 ; EG-DAG: ASHR
-define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
-  %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i16>, ptr addrspace(3) %in
   %ext = sext <8 x i16> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+  store <8 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -800,10 +800,10 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)*
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
-  %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i16>, ptr addrspace(3) %in
   %ext = zext <16 x i16> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+  store <16 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -836,10 +836,10 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
 ; EG-DAG: ASHR
-define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
-  %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i16>, ptr addrspace(3) %in
   %ext = sext <16 x i16> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+  store <16 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -864,10 +864,10 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
-  %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i16>, ptr addrspace(3) %in
   %ext = zext <32 x i16> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+  store <32 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -924,26 +924,26 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
 ; EG-DAG: ASHR
-define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
-  %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i16>, ptr addrspace(3) %in
   %ext = sext <32 x i16> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+  store <32 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64:
-; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
-;   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
+; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+;   %load = load <64 x i16>, ptr addrspace(3) %in
 ;   %ext = zext <64 x i16> %load to <64 x i64>
-;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+;   store <64 x i64> %ext, ptr addrspace(3) %out
 ;   ret void
 ; }
 
 ; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64:
-; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
-;   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
+; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+;   %load = load <64 x i16>, ptr addrspace(3) %in
 ;   %ext = sext <64 x i16> %load to <64 x i64>
-;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+;   store <64 x i64> %ext, ptr addrspace(3) %out
 ;   ret void
 ; }
 
@@ -960,9 +960,9 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_v8i16_to_128(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
-  %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in, align 16
-  store <8 x i16> %ld, <8 x i16> addrspace(3)* %out, align 16
+define amdgpu_kernel void @local_v8i16_to_128(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+  %ld = load <8 x i16>, ptr addrspace(3) %in, align 16
+  store <8 x i16> %ld, ptr addrspace(3) %out, align 16
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
index 224ccb85e2c8e..24fef77e0c10d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
@@ -16,10 +16,10 @@
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load i32, i32 addrspace(3)* %in
-  store i32 %ld, i32 addrspace(3)* %out
+  %ld = load i32, ptr addrspace(3) %in
+  store i32 %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -28,10 +28,10 @@ entry:
 ; GFX9-NOT: m0
 
 ; GCN: ds_read_b64
-define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
-  store <2 x i32> %ld, <2 x i32> addrspace(3)* %out
+  %ld = load <2 x i32>, ptr addrspace(3) %in
+  store <2 x i32> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -42,10 +42,10 @@ entry:
 ; SI-DAG: ds_read_b64
 ; SI-DAG: ds_read_b32
 ; CIVI-DAG: ds_read_b96
-define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in
-  store <3 x i32> %ld, <3 x i32> addrspace(3)* %out
+  %ld = load <3 x i32>, ptr addrspace(3) %in
+  store <3 x i32> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -55,10 +55,10 @@ entry:
 
 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
 
-define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
-  store <4 x i32> %ld, <4 x i32> addrspace(3)* %out
+  %ld = load <4 x i32>, ptr addrspace(3) %in
+  store <4 x i32> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -68,10 +68,10 @@ entry:
 
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
-  store <8 x i32> %ld, <8 x i32> addrspace(3)* %out
+  %ld = load <8 x i32>, ptr addrspace(3) %in
+  store <8 x i32> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -87,10 +87,10 @@ entry:
 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
-define amdgpu_kernel void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
-  store <16 x i32> %ld, <16 x i32> addrspace(3)* %out
+  %ld = load <16 x i32>, ptr addrspace(3) %in
+  store <16 x i32> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -98,10 +98,10 @@ entry:
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
-  %ld = load i32, i32 addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_i32_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load i32, ptr addrspace(3) %in
   %ext = zext i32 %ld to i64
-  store i64 %ext, i64 addrspace(3)* %out
+  store i64 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -109,10 +109,10 @@ define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
-  %ld = load i32, i32 addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_i32_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load i32, ptr addrspace(3) %in
   %ext = sext i32 %ld to i64
-  store i64 %ext, i64 addrspace(3)* %out
+  store i64 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -120,10 +120,10 @@ define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
-  %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <1 x i32>, ptr addrspace(3) %in
   %ext = zext <1 x i32> %ld to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+  store <1 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -131,10 +131,10 @@ define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)*
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
-  %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <1 x i32>, ptr addrspace(3) %in
   %ext = sext <1 x i32> %ld to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+  store <1 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -142,10 +142,10 @@ define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)*
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
-  %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <2 x i32>, ptr addrspace(3) %in
   %ext = zext <2 x i32> %ld to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+  store <2 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -153,10 +153,10 @@ define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)*
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
-  %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <2 x i32>, ptr addrspace(3) %in
   %ext = sext <2 x i32> %ld to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+  store <2 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -164,10 +164,10 @@ define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)*
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
-  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <4 x i32>, ptr addrspace(3) %in
   %ext = zext <4 x i32> %ld to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+  store <4 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -175,10 +175,10 @@ define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)*
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
-  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <4 x i32>, ptr addrspace(3) %in
   %ext = sext <4 x i32> %ld to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+  store <4 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -195,9 +195,9 @@ define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)*
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_v4i32_to_128(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) {
-  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
-  store <4 x i32> %ld, <4 x i32> addrspace(3)* %out, align 16
+define amdgpu_kernel void @local_v4i32_to_128(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+  %ld = load <4 x i32>, ptr addrspace(3) %in, align 16
+  store <4 x i32> %ld, ptr addrspace(3) %out, align 16
   ret void
 }
 
@@ -205,10 +205,10 @@ define amdgpu_kernel void @local_v4i32_to_128(<4 x i32> addrspace(3)* %out, <4 x
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
-  %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <8 x i32>, ptr addrspace(3) %in
   %ext = zext <8 x i32> %ld to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+  store <8 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -216,10 +216,10 @@ define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)*
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
-  %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <8 x i32>, ptr addrspace(3) %in
   %ext = sext <8 x i32> %ld to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+  store <8 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -227,10 +227,10 @@ define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)*
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
-  %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <16 x i32>, ptr addrspace(3) %in
   %ext = sext <16 x i32> %ld to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+  store <16 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -238,10 +238,10 @@ define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
-  %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <16 x i32>, ptr addrspace(3) %in
   %ext = zext <16 x i32> %ld to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+  store <16 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -249,10 +249,10 @@ define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
-  %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <32 x i32>, ptr addrspace(3) %in
   %ext = sext <32 x i32> %ld to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+  store <32 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -260,10 +260,10 @@ define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(
 ; SICIVI: s_mov_b32 m0, -1
 ; GFX9-NOT: m0
 
-define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
-  %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <32 x i32>, ptr addrspace(3) %in
   %ext = zext <32 x i32> %ld to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+  store <32 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -272,9 +272,9 @@ define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(
 ; GFX9-NOT: m0
 ; GFX9-NOT: accvgpr
 
-define amdgpu_kernel void @local_load_v32i32(<32 x i32> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
-  %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
-  store <32 x i32> %ld, <32 x i32> addrspace(3)* %out
+define amdgpu_kernel void @local_load_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <32 x i32>, ptr addrspace(3) %in
+  store <32 x i32> %ld, ptr addrspace(3) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i64.ll b/llvm/test/CodeGen/AMDGPU/load-local-i64.ll
index 8a07640e4f4d3..e6d5ba9de6398 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i64.ll
@@ -17,9 +17,9 @@
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 {
-  %ld = load i64, i64 addrspace(3)* %in
-  store i64 %ld, i64 addrspace(3)* %out
+define amdgpu_kernel void @local_load_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load i64, ptr addrspace(3) %in
+  store i64 %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -33,10 +33,10 @@ define amdgpu_kernel void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <2 x i64>, <2 x i64> addrspace(3)* %in
-  store <2 x i64> %ld, <2 x i64> addrspace(3)* %out
+  %ld = load <2 x i64>, ptr addrspace(3) %in
+  store <2 x i64> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -46,10 +46,10 @@ entry:
 ; CIVI: ds_read_b128
 ; CIVI: ds_write_b128
 
-define amdgpu_kernel void @local_load_v2i64_to_128(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v2i64_to_128(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %ld = load <2 x i64>, <2 x i64> addrspace(3)* %in, align 16
-  store <2 x i64> %ld, <2 x i64> addrspace(3)* %out, align 16
+  %ld = load <2 x i64>, ptr addrspace(3) %in, align 16
+  store <2 x i64> %ld, ptr addrspace(3) %out, align 16
   ret void
 }
 
@@ -66,10 +66,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <3 x i64>, <3 x i64> addrspace(3)* %in
-  store <3 x i64> %ld, <3 x i64> addrspace(3)* %out
+  %ld = load <3 x i64>, ptr addrspace(3) %in
+  store <3 x i64> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -89,10 +89,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <4 x i64>, <4 x i64> addrspace(3)* %in
-  store <4 x i64> %ld, <4 x i64> addrspace(3)* %out
+  %ld = load <4 x i64>, ptr addrspace(3) %in
+  store <4 x i64> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -121,10 +121,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <8 x i64>, <8 x i64> addrspace(3)* %in
-  store <8 x i64> %ld, <8 x i64> addrspace(3)* %out
+  %ld = load <8 x i64>, ptr addrspace(3) %in
+  store <8 x i64> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -180,10 +180,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <16 x i64>, <16 x i64> addrspace(3)* %in
-  store <16 x i64> %ld, <16 x i64> addrspace(3)* %out
+  %ld = load <16 x i64>, ptr addrspace(3) %in
+  store <16 x i64> %ld, ptr addrspace(3) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
index b8cbc7b0892c2..54fef9184fe70 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -14,10 +14,10 @@
 ; GCN: ds_read_u8
 
 ; EG: LDS_UBYTE_READ_RET
-define amdgpu_kernel void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i8(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load i8, i8 addrspace(3)* %in
-  store i8 %ld, i8 addrspace(3)* %out
+  %ld = load i8, ptr addrspace(3) %in
+  store i8 %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -28,10 +28,10 @@ entry:
 ; GCN: ds_read_u16
 
 ; EG: LDS_USHORT_READ_RET
-define amdgpu_kernel void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i8(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in
-  store <2 x i8> %ld, <2 x i8> addrspace(3)* %out
+  %ld = load <2 x i8>, ptr addrspace(3) %in
+  store <2 x i8> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -40,10 +40,10 @@ entry:
 ; GCN: ds_read_b32
 
 ; EG: DS_READ_RET
-define amdgpu_kernel void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i8(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
-  store <3 x i8> %ld, <3 x i8> addrspace(3)* %out
+  %ld = load <3 x i8>, ptr addrspace(3) %in
+  store <3 x i8> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -52,10 +52,10 @@ entry:
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i8(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in
-  store <4 x i8> %ld, <4 x i8> addrspace(3)* %out
+  %ld = load <4 x i8>, ptr addrspace(3) %in
+  store <4 x i8> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -65,10 +65,10 @@ entry:
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i8(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in
-  store <8 x i8> %ld, <8 x i8> addrspace(3)* %out
+  %ld = load <8 x i8>, ptr addrspace(3) %in
+  store <8 x i8> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -81,10 +81,10 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i8(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in
-  store <16 x i8> %ld, <16 x i8> addrspace(3)* %out
+  %ld = load <16 x i8>, ptr addrspace(3) %in
+  store <16 x i8> %ld, ptr addrspace(3) %out
   ret void
 }
 
@@ -95,10 +95,10 @@ entry:
 ; GCN: ds_read_u8
 
 ; EG: LDS_UBYTE_READ_RET
-define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
-  %a = load i8, i8 addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_i8_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i8, ptr addrspace(3) %in
   %ext = zext i8 %a to i32
-  store i32 %ext, i32 addrspace(3)* %out
+  store i32 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -110,20 +110,20 @@ define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 a
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
-define amdgpu_kernel void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
-  %ld = load i8, i8 addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_i8_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %ld = load i8, ptr addrspace(3) %in
   %ext = sext i8 %ld to i32
-  store i32 %ext, i32 addrspace(3)* %out
+  store i32 %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32:
 
 ; EG: LDS_UBYTE_READ_RET
-define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(3) %in
   %ext = zext <1 x i8> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+  store <1 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -132,10 +132,10 @@ define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)*
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
-define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(3) %in
   %ext = sext <1 x i8> %load to <1 x i32>
-  store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+  store <1 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -144,10 +144,10 @@ define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)*
 ; GCN: ds_read_u16
 
 ; EG: LDS_USHORT_READ_RET
-define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(3) %in
   %ext = zext <2 x i8> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+  store <2 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -171,10 +171,10 @@ define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)*
 ; EG: LDS_USHORT_READ_RET
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(3) %in
   %ext = sext <2 x i8> %load to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+  store <2 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -188,11 +188,11 @@ define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)*
 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
 
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
+  %ld = load <3 x i8>, ptr addrspace(3) %in
   %ext = zext <3 x i8> %ld to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+  store <3 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -216,11 +216,11 @@ entry:
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
+  %ld = load <3 x i8>, ptr addrspace(3) %in
   %ext = sext <3 x i8> %ld to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+  store <3 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -234,10 +234,10 @@ entry:
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
-define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(3) %in
   %ext = zext <4 x i8> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+  store <4 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -252,10 +252,10 @@ define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)*
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(3) %in
   %ext = sext <4 x i8> %load to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+  store <4 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -271,10 +271,10 @@ define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)*
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
-define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(3) %in
   %ext = zext <8 x i8> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+  store <8 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -292,10 +292,10 @@ define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)*
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(3) %in
   %ext = sext <8 x i8> %load to <8 x i32>
-  store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+  store <8 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -319,10 +319,10 @@ define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)*
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
-define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(3) %in
   %ext = zext <16 x i8> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+  store <16 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -350,10 +350,10 @@ define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(3) %in
   %ext = sext <16 x i8> %load to <16 x i32>
-  store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+  store <16 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -369,10 +369,10 @@ define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(3) %in
   %ext = zext <32 x i8> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+  store <32 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -388,10 +388,10 @@ define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
-define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(3) %in
   %ext = sext <32 x i8> %load to <32 x i32>
-  store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+  store <32 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -415,10 +415,10 @@ define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
-  %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <64 x i8>, ptr addrspace(3) %in
   %ext = zext <64 x i8> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+  store <64 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -442,10 +442,10 @@ define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
-define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
-  %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <64 x i8>, ptr addrspace(3) %in
   %ext = sext <64 x i8> %load to <64 x i32>
-  store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+  store <64 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -460,10 +460,10 @@ define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3
 ; EG: LDS_UBYTE_READ_RET
 ; EG: MOV {{.*}}, literal
 ; EG: 0.0
-define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
-  %a = load i8, i8 addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_i8_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i8, ptr addrspace(3) %in
   %ext = zext i8 %a to i64
-  store i64 %ext, i64 addrspace(3)* %out
+  store i64 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -480,10 +480,10 @@ define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 a
 ; EG: ASHR
 ; TODO: why not 7?
 ; EG: 31
-define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
-  %a = load i8, i8 addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_i8_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i8, ptr addrspace(3) %in
   %ext = sext i8 %a to i64
-  store i64 %ext, i64 addrspace(3)* %out
+  store i64 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -495,10 +495,10 @@ define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 a
 ; EG: MOV {{.*}}, literal
 ; TODO: merge?
 ; EG: 0.0
-define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(3) %in
   %ext = zext <1 x i8> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+  store <1 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -510,10 +510,10 @@ define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)*
 ; EG: ASHR
 ; TODO: why not 7?
 ; EG: 31
-define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(3) %in
   %ext = sext <1 x i8> %load to <1 x i64>
-  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+  store <1 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -522,10 +522,10 @@ define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)*
 ; GFX9-NOT: m0
 
 ; EG: LDS_USHORT_READ_RET
-define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(3) %in
   %ext = zext <2 x i8> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+  store <2 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -536,10 +536,10 @@ define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)*
 ; EG: LDS_USHORT_READ_RET
 ; EG: BFE_INT
 ; EG: BFE_INT
-define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(3) %in
   %ext = sext <2 x i8> %load to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+  store <2 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -548,10 +548,10 @@ define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)*
 ; GFX9-NOT: m0
 
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(3) %in
   %ext = zext <4 x i8> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+  store <4 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -560,10 +560,10 @@ define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)*
 ; GFX9-NOT: m0
 
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(3) %in
   %ext = sext <4 x i8> %load to <4 x i64>
-  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+  store <4 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -573,10 +573,10 @@ define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)*
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(3) %in
   %ext = zext <8 x i8> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+  store <8 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -595,10 +595,10 @@ define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)*
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(3) %in
   %ext = sext <8 x i8> %load to <8 x i64>
-  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+  store <8 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -610,10 +610,10 @@ define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)*
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(3) %in
   %ext = zext <16 x i8> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+  store <16 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -625,10 +625,10 @@ define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(3) %in
   %ext = sext <16 x i8> %load to <16 x i64>
-  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+  store <16 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -644,10 +644,10 @@ define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(3) %in
   %ext = zext <32 x i8> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+  store <32 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -663,26 +663,26 @@ define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(3) %in
   %ext = sext <32 x i8> %load to <32 x i64>
-  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+  store <32 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64:
-; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+;   %load = load <64 x i8>, ptr addrspace(3) %in
 ;   %ext = zext <64 x i8> %load to <64 x i64>
-;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+;   store <64 x i64> %ext, ptr addrspace(3) %out
 ;   ret void
 ; }
 
 ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64:
-; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+;   %load = load <64 x i8>, ptr addrspace(3) %in
 ;   %ext = sext <64 x i8> %load to <64 x i64>
-;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+;   store <64 x i64> %ext, ptr addrspace(3) %out
 ;   ret void
 ; }
 
@@ -694,10 +694,10 @@ define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: LDS_SHORT_WRITE
-define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
-  %a = load i8, i8 addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_i8_to_i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i8, ptr addrspace(3) %in
   %ext = zext i8 %a to i16
-  store i16 %ext, i16 addrspace(3)* %out
+  store i16 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -710,10 +710,10 @@ define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 a
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
 ; EG: LDS_SHORT_WRITE
-define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
-  %a = load i8, i8 addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_i8_to_i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %a = load i8, ptr addrspace(3) %in
   %ext = sext i8 %a to i16
-  store i16 %ext, i16 addrspace(3)* %out
+  store i16 %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -723,10 +723,10 @@ define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 a
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: LDS_SHORT_WRITE
-define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(3) %in
   %ext = zext <1 x i8> %load to <1 x i16>
-  store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
+  store <1 x i16> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -737,10 +737,10 @@ define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)*
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
 ; EG: LDS_SHORT_WRITE
-define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
-  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <1 x i8>, ptr addrspace(3) %in
   %ext = sext <1 x i8> %load to <1 x i16>
-  store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
+  store <1 x i16> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -750,10 +750,10 @@ define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)*
 
 ; EG: LDS_USHORT_READ_RET
 ; EG: LDS_WRITE
-define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(3) %in
   %ext = zext <2 x i8> %load to <2 x i16>
-  store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
+  store <2 x i16> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -765,10 +765,10 @@ define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)*
 ; EG: BFE_INT
 ; EG: BFE_INT
 ; EG: LDS_WRITE
-define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
-  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <2 x i8>, ptr addrspace(3) %in
   %ext = sext <2 x i8> %load to <2 x i16>
-  store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
+  store <2 x i16> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -779,10 +779,10 @@ define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)*
 ; EG: LDS_READ_RET
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(3) %in
   %ext = zext <4 x i8> %load to <4 x i16>
-  store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
+  store <4 x i16> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -798,10 +798,10 @@ define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)*
 ; EG-DAG: BFE_INT
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
-  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <4 x i8>, ptr addrspace(3) %in
   %ext = sext <4 x i8> %load to <4 x i16>
-  store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
+  store <4 x i16> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -815,10 +815,10 @@ define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)*
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(3) %in
   %ext = zext <8 x i8> %load to <8 x i16>
-  store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
+  store <8 x i16> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -841,10 +841,10 @@ define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)*
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
-  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <8 x i8>, ptr addrspace(3) %in
   %ext = sext <8 x i8> %load to <8 x i16>
-  store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
+  store <8 x i16> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -864,10 +864,10 @@ define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)*
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(3) %in
   %ext = zext <16 x i8> %load to <16 x i16>
-  store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
+  store <16 x i16> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -904,10 +904,10 @@ define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
-  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <16 x i8>, ptr addrspace(3) %in
   %ext = sext <16 x i8> %load to <16 x i16>
-  store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
+  store <16 x i16> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -939,10 +939,10 @@ define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(3) %in
   %ext = zext <32 x i8> %load to <32 x i16>
-  store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
+  store <32 x i16> %ext, ptr addrspace(3) %out
   ret void
 }
 
@@ -1003,26 +1003,26 @@ define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
-  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+  %load = load <32 x i8>, ptr addrspace(3) %in
   %ext = sext <32 x i8> %load to <32 x i16>
-  store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
+  store <32 x i16> %ext, ptr addrspace(3) %out
   ret void
 }
 
 ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16:
-; define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+; define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+;   %load = load <64 x i8>, ptr addrspace(3) %in
 ;   %ext = zext <64 x i8> %load to <64 x i16>
-;   store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
+;   store <64 x i16> %ext, ptr addrspace(3) %out
 ;   ret void
 ; }
 
 ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16:
-; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
-;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+;   %load = load <64 x i8>, ptr addrspace(3) %in
 ;   %ext = sext <64 x i8> %load to <64 x i16>
-;   store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
+;   store <64 x i16> %ext, ptr addrspace(3) %out
 ;   ret void
 ; }
 
@@ -1039,9 +1039,9 @@ define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define amdgpu_kernel void @local_v16i8_to_128(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) {
-  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 16
-  store <16 x i8> %ld, <16 x i8> addrspace(3)* %out, align 16
+define amdgpu_kernel void @local_v16i8_to_128(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+  %ld = load <16 x i8>, ptr addrspace(3) %in, align 16
+  store <16 x i8> %ld, ptr addrspace(3) %out, align 16
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
index 8ca037fd49c84..f8a58fadf3982 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
@@ -4,7 +4,7 @@
 ; Test that checks for redundant copies to temporary stack slot produced by
 ; expandUnalignedLoad.
 
-define amdgpu_vs void @test(<4 x i32> inreg %arg1, <6 x float> addrspace(3)* %arg2) {
+define amdgpu_vs void @test(<4 x i32> inreg %arg1, ptr addrspace(3) %arg2) {
 ; CHECK-LABEL: test:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
@@ -21,13 +21,13 @@ define amdgpu_vs void @test(<4 x i32> inreg %arg1, <6 x float> addrspace(3)* %ar
 ; CHECK-NEXT:    tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen
 ; CHECK-NEXT:    s_endpgm
   call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 0, float undef, float undef, float undef, float undef, i1 immarg false, i1 immarg false)
-  %var1 = load <6 x float>, <6 x float> addrspace(3)* %arg2, align 4
+  %var1 = load <6 x float>, ptr addrspace(3) %arg2, align 4
   %var2 = shufflevector <6 x float> %var1, <6 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %var2, <4 x i32> %arg1, i32 0, i32 0, i32 0, i32 immarg 126, i32 immarg 0)
   ret void
 }
 
-define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3, <8 x float> addrspace(3)* %arg4) {
+define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3, ptr addrspace(3) %arg4) {
 ; CHECK-LABEL: test_2:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, 28, v1
@@ -51,7 +51,7 @@ define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3,
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    tbuffer_store_format_xyzw v[2:5], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:16 glc slc
 ; CHECK-NEXT:    s_endpgm
-  %load = load <8 x float>, <8 x float> addrspace(3)* %arg4, align 4
+  %load = load <8 x float>, ptr addrspace(3) %arg4, align 4
   %vec1 = shufflevector <8 x float> %load, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec1, <4 x i32> %arg1, i32 %arg2, i32 0, i32 %arg3, i32 immarg 77, i32 immarg 3)
   %vec2 = shufflevector <8 x float> %load, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -59,7 +59,7 @@ define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3,
   ret void
 }
 
-define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg %arg3, i32 %arg4, <6 x float> addrspace(3)* %arg5, <6 x float> addrspace(3)* %arg6) {
+define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg %arg3, i32 %arg4, ptr addrspace(3) %arg5, ptr addrspace(3) %arg6) {
 ; CHECK-LABEL: test_3:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_mov_b32 s7, s5
@@ -101,7 +101,7 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
 ; CHECK-NEXT:    s_endpgm
-  %load1 = load <6 x float>, <6 x float> addrspace(3)* %arg5, align 4
+  %load1 = load <6 x float>, ptr addrspace(3) %arg5, align 4
   %vec11 = shufflevector <6 x float> %load1, <6 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec11, <4 x i32> %arg3, i32 %arg1, i32 264, i32 %arg2, i32 immarg 77, i32 immarg 3)
   %vec12 = shufflevector <6 x float> %load1, <6 x float> undef, <2 x i32> <i32 4, i32 5>
@@ -109,7 +109,7 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg
 
   call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 0, float undef, float undef, float undef, float undef, i1 immarg false, i1 immarg false)
 
-  %load2 = load <6 x float>, <6 x float> addrspace(3)* %arg6, align 4
+  %load2 = load <6 x float>, ptr addrspace(3) %arg6, align 4
   %vec21 = shufflevector <6 x float> %load2, <6 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec21, <4 x i32> %arg3, i32 %arg1, i32 240, i32 %arg2, i32 immarg 77, i32 immarg 3)
   %vec22 = shufflevector <6 x float> %load2, <6 x float> undef, <2 x i32> <i32 4, i32 5>

diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
index 950223555f4e3..729cf95245dea 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
 
-define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
+define <4 x i32> @load_lds_v4i32(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47,11 +47,11 @@ define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
 ; GFX11-NEXT:    ds_load_b128 v[0:3], v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr
+  %load = load <4 x i32>, ptr addrspace(3) %ptr
   ret <4 x i32> %load
 }
 
-define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
+define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -305,11 +305,11 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
+  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 1
   ret <4 x i32> %load
 }
 
-define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
+define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -437,11 +437,11 @@ define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
+  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 2
   ret <4 x i32> %load
 }
 
-define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
+define <4 x i32> @load_lds_v4i32_align4(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -494,11 +494,11 @@ define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
 ; GFX11-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
+  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 4
   ret <4 x i32> %load
 }
 
-define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
+define <4 x i32> @load_lds_v4i32_align8(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -540,11 +540,11 @@ define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
 ; GFX11-NEXT:    ds_load_2addr_b64 v[0:3], v0 offset1:1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
+  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 8
   ret <4 x i32> %load
 }
 
-define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) {
+define <4 x i32> @load_lds_v4i32_align16(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -586,6 +586,6 @@ define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) {
 ; GFX11-NEXT:    ds_load_b128 v[0:3], v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16
+  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 16
   ret <4 x i32> %load
 }

diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
index 6933fc4a05e8d..6af1440b5bf28 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
 
-define <3 x i32> @load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) {
+define <3 x i32> @load_lds_v3i32(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -47,11 +47,11 @@ define <3 x i32> @load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) {
 ; GFX11-NEXT:    ds_load_b96 v[0:2], v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr
+  %load = load <3 x i32>, ptr addrspace(3) %ptr
   ret <3 x i32> %load
 }
 
-define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
+define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -252,11 +252,11 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
+  %load = load <3 x i32>, ptr addrspace(3) %ptr, align 1
   ret <3 x i32> %load
 }
 
-define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
+define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -359,11 +359,11 @@ define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
+  %load = load <3 x i32>, ptr addrspace(3) %ptr, align 2
   ret <3 x i32> %load
 }
 
-define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
+define <3 x i32> @load_lds_v3i32_align4(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -414,11 +414,11 @@ define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
 ; GFX11-NEXT:    ds_load_b32 v2, v2 offset:8
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
+  %load = load <3 x i32>, ptr addrspace(3) %ptr, align 4
   ret <3 x i32> %load
 }
 
-define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
+define <3 x i32> @load_lds_v3i32_align8(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -468,11 +468,11 @@ define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
 ; GFX11-NEXT:    ds_load_b32 v2, v2 offset:8
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8
+  %load = load <3 x i32>, ptr addrspace(3) %ptr, align 8
   ret <3 x i32> %load
 }
 
-define <3 x i32> @load_lds_v3i32_align16(<3 x i32> addrspace(3)* %ptr) {
+define <3 x i32> @load_lds_v3i32_align16(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -514,6 +514,6 @@ define <3 x i32> @load_lds_v3i32_align16(<3 x i32> addrspace(3)* %ptr) {
 ; GFX11-NEXT:    ds_load_b96 v[0:2], v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
+  %load = load <3 x i32>, ptr addrspace(3) %ptr, align 16
   ret <3 x i32> %load
 }

diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 86bf754ecfcf0..4dfc773d615e4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -17,12 +17,12 @@
 ; GCN-NOT: load_dword
 
 ; GCN: flat_store_dwordx2
-define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], i64* %ptr0, [8 x i32], i64* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
+define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %ptr0, [8 x i32], ptr %ptr1, [8 x i32], ptr addrspace(1) %ptr2) {
   %tmp2 = icmp eq i32 %tmp, 0
-  %tmp3 = load i64, i64* %ptr0, align 8
-  %tmp4 = load i64, i64* %ptr1, align 8
+  %tmp3 = load i64, ptr %ptr0, align 8
+  %tmp4 = load i64, ptr %ptr1, align 8
   %tmp5 = select i1 %tmp2, i64 %tmp3, i64 %tmp4
-  store i64 %tmp5, i64 addrspace(1)* %ptr2, align 8
+  store i64 %tmp5, ptr addrspace(1) %ptr2, align 8
   ret void
 }
 
@@ -38,12 +38,12 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], i64* %
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
 ; GCN: flat_store_dwordx2
-define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], i64 addrspace(1)* %ptr0, [8 x i32], i64 addrspace(1)* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
+define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], ptr addrspace(1) %ptr0, [8 x i32], ptr addrspace(1) %ptr1, [8 x i32], ptr addrspace(1) %ptr2) {
   %tmp2 = icmp eq i32 %tmp, 0
-  %tmp3 = load i64, i64 addrspace(1)* %ptr0, align 8
-  %tmp4 = load i64, i64 addrspace(1)* %ptr1, align 8
+  %tmp3 = load i64, ptr addrspace(1) %ptr0, align 8
+  %tmp4 = load i64, ptr addrspace(1) %ptr1, align 8
   %tmp5 = select i1 %tmp2, i64 %tmp3, i64 %tmp4
-  store i64 %tmp5, i64 addrspace(1)* %ptr2, align 8
+  store i64 %tmp5, ptr addrspace(1) %ptr2, align 8
   ret void
 }
 
@@ -53,12 +53,12 @@ define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], i64
 ; GCN: v_cndmask_b32
 ; GCN: v_cndmask_b32
 ; GCN: flat_store_dwordx2
-define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, i64 addrspace(3)* %ptr0, i64 addrspace(3)* %ptr1, i64 addrspace(1)* %ptr2) {
+define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, ptr addrspace(3) %ptr0, ptr addrspace(3) %ptr1, ptr addrspace(1) %ptr2) {
   %tmp2 = icmp eq i32 %tmp, 0
-  %tmp3 = load i64, i64 addrspace(3)* %ptr0, align 8
-  %tmp4 = load i64, i64 addrspace(3)* %ptr1, align 8
+  %tmp3 = load i64, ptr addrspace(3) %ptr0, align 8
+  %tmp4 = load i64, ptr addrspace(3) %ptr1, align 8
   %tmp5 = select i1 %tmp2, i64 %tmp3, i64 %tmp4
-  store i64 %tmp5, i64 addrspace(1)* %ptr2, align 8
+  store i64 %tmp5, ptr addrspace(1) %ptr2, align 8
   ret void
 }
 
@@ -70,13 +70,13 @@ define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, i64 addrspace(3)
 ; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:512
 ; GCN: v_cndmask_b32
 ; GCN: v_cndmask_b32
-define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, i64 addrspace(3)* %ptr0, i64 addrspace(3)* %ptr1, i64 addrspace(1)* %ptr2) {
+define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, ptr addrspace(3) %ptr0, ptr addrspace(3) %ptr1, ptr addrspace(1) %ptr2) {
   %tmp2 = icmp eq i32 %tmp, 0
-  %gep0 = getelementptr inbounds i64, i64 addrspace(3)* %ptr0, i64 16
-  %gep1 = getelementptr inbounds i64, i64 addrspace(3)* %ptr1, i64 64
-  %tmp3 = load i64, i64 addrspace(3)* %gep0, align 8
-  %tmp4 = load i64, i64 addrspace(3)* %gep1, align 8
+  %gep0 = getelementptr inbounds i64, ptr addrspace(3) %ptr0, i64 16
+  %gep1 = getelementptr inbounds i64, ptr addrspace(3) %ptr1, i64 64
+  %tmp3 = load i64, ptr addrspace(3) %gep0, align 8
+  %tmp4 = load i64, ptr addrspace(3) %gep1, align 8
   %tmp5 = select i1 %tmp2, i64 %tmp3, i64 %tmp4
-  store i64 %tmp5, i64 addrspace(1)* %ptr2, align 8
+  store i64 %tmp5, ptr addrspace(1) %ptr2, align 8
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
index 3e19f92de4b3a..df31ab06d277e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -8,10 +8,10 @@
 ; SI-DAG: {{flat|buffer}}_load_ubyte
 ; SI-DAG: {{flat|buffer}}_load_ushort
 ; SI: {{flat|buffer}}_store_dword
-define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
-  %1 = load i24, i24 addrspace(1)* %in
+define amdgpu_kernel void @load_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %1 = load i24, ptr addrspace(1) %in
   %2 = zext i24 %1 to i32
-  store i32 %2, i32 addrspace(1)* %out
+  store i32 %2, ptr addrspace(1) %out
   ret void
 }
 
@@ -21,10 +21,10 @@ define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %i
 
 ; CI-HSA: flat_load_dword [[VAL:v[0-9]+]]
 ; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]]
-define amdgpu_kernel void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 {
-  %1 = load i25, i25 addrspace(1)* %in
+define amdgpu_kernel void @load_i25(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %1 = load i25, ptr addrspace(1) %in
   %2 = zext i25 %1 to i32
-  store i32 %2, i32 addrspace(1)* %out
+  store i32 %2, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/local-64.ll b/llvm/test/CodeGen/AMDGPU/local-64.ll
index 3e85dd5905fca..2923ba9bb65c6 100644
--- a/llvm/test/CodeGen/AMDGPU/local-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-64.ll
@@ -9,10 +9,10 @@
 
 ; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
 ; GCN: buffer_store_dword [[REG]],
-define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
-  %val = load i32, i32 addrspace(3)* %gep, align 4
-  store i32 %val, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @local_i32_load(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %in, i32 7
+  %val = load i32, ptr addrspace(3) %gep, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -22,9 +22,9 @@ define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(
 
 ; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
 ; GCN: buffer_store_dword [[REG]],
-define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
-  %val = load i32, i32 addrspace(3)* %in, align 4
-  store i32 %val, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @local_i32_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind {
+  %val = load i32, ptr addrspace(3) %in, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -35,10 +35,10 @@ define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 a
 ; GCN-NOT: add
 ; GCN: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
 ; GCN: buffer_store_byte [[REG]],
-define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
-  %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535
-  %val = load i8, i8 addrspace(3)* %gep, align 4
-  store i8 %val, i8 addrspace(1)* %out, align 4
+define amdgpu_kernel void @local_i8_load_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind {
+  %gep = getelementptr i8, ptr addrspace(3) %in, i32 65535
+  %val = load i8, ptr addrspace(3) %gep, align 4
+  store i8 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -56,10 +56,10 @@ define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i
 ; GCN-DAG: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
 ; GCN: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
 ; GCN: buffer_store_byte [[REG]],
-define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
-  %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536
-  %val = load i8, i8 addrspace(3)* %gep, align 4
-  store i8 %val, i8 addrspace(1)* %out, align 4
+define amdgpu_kernel void @local_i8_load_over_i16_max_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind {
+  %gep = getelementptr i8, ptr addrspace(3) %in, i32 65536
+  %val = load i8, ptr addrspace(3) %gep, align 4
+  store i8 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -70,10 +70,10 @@ define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %o
 ; GCN-NOT: add
 ; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
 ; GCN: buffer_store_dwordx2 [[REG]],
-define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7
-  %val = load i64, i64 addrspace(3)* %gep, align 8
-  store i64 %val, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @local_i64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %in, i32 7
+  %val = load i64, ptr addrspace(3) %gep, align 8
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -83,9 +83,9 @@ define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(
 
 ; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 ; GCN: buffer_store_dwordx2 [[REG]],
-define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
-  %val = load i64, i64 addrspace(3)* %in, align 8
-  store i64 %val, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @local_i64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind {
+  %val = load i64, ptr addrspace(3) %in, align 8
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -96,10 +96,10 @@ define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 a
 ; GCN-NOT: add
 ; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
 ; GCN: buffer_store_dwordx2 [[REG]],
-define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
-  %gep = getelementptr double, double addrspace(3)* %in, i32 7
-  %val = load double, double addrspace(3)* %gep, align 8
-  store double %val, double addrspace(1)* %out, align 8
+define amdgpu_kernel void @local_f64_load(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind {
+  %gep = getelementptr double, ptr addrspace(3) %in, i32 7
+  %val = load double, ptr addrspace(3) %gep, align 8
+  store double %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -109,9 +109,9 @@ define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addr
 
 ; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 ; GCN: buffer_store_dwordx2 [[REG]],
-define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
-  %val = load double, double addrspace(3)* %in, align 8
-  store double %val, double addrspace(1)* %out, align 8
+define amdgpu_kernel void @local_f64_load_0_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) nounwind {
+  %val = load double, ptr addrspace(3) %in, align 8
+  store double %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -121,9 +121,9 @@ define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, do
 
 ; GCN-NOT: add
 ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
-define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7
-  store i64 5678, i64 addrspace(3)* %gep, align 8
+define amdgpu_kernel void @local_i64_store(ptr addrspace(3) %out) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %out, i32 7
+  store i64 5678, ptr addrspace(3) %gep, align 8
   ret void
 }
 
@@ -133,8 +133,8 @@ define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind {
 
 ; GCN-NOT: add
 ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
-  store i64 1234, i64 addrspace(3)* %out, align 8
+define amdgpu_kernel void @local_i64_store_0_offset(ptr addrspace(3) %out) nounwind {
+  store i64 1234, ptr addrspace(3) %out, align 8
   ret void
 }
 
@@ -144,9 +144,9 @@ define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) noun
 
 ; GCN-NOT: add
 ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
-define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind {
-  %gep = getelementptr double, double addrspace(3)* %out, i32 7
-  store double 16.0, double addrspace(3)* %gep, align 8
+define amdgpu_kernel void @local_f64_store(ptr addrspace(3) %out) nounwind {
+  %gep = getelementptr double, ptr addrspace(3) %out, i32 7
+  store double 16.0, ptr addrspace(3) %gep, align 8
   ret void
 }
 
@@ -155,8 +155,8 @@ define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind {
 ; GFX9-NOT: m0
 
 ; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
-  store double 20.0, double addrspace(3)* %out, align 8
+define amdgpu_kernel void @local_f64_store_0_offset(ptr addrspace(3) %out) nounwind {
+  store double 20.0, ptr addrspace(3) %out, align 8
   ret void
 }
 
@@ -168,9 +168,9 @@ define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) n
 ; SI: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
 ; CIPLUS: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112
 ; GCN: s_endpgm
-define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
-  %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
-  store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
+define amdgpu_kernel void @local_v2i64_store(ptr addrspace(3) %out) nounwind {
+  %gep = getelementptr <2 x i64>, ptr addrspace(3) %out, i32 7
+  store <2 x i64> <i64 5678, i64 5678>, ptr addrspace(3) %gep, align 16
   ret void
 }
 
@@ -184,8 +184,8 @@ define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounw
 ; CIPLUS: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]$}}
 
 ; GCN: s_endpgm
-define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
-  store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
+define amdgpu_kernel void @local_v2i64_store_0_offset(ptr addrspace(3) %out) nounwind {
+  store <2 x i64> <i64 1234, i64 1234>, ptr addrspace(3) %out, align 16
   ret void
 }
 
@@ -201,9 +201,9 @@ define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %o
 ; CIPLUS-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240{{$}}
 
 ; GCN: s_endpgm
-define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
-  %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
-  store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
+define amdgpu_kernel void @local_v4i64_store(ptr addrspace(3) %out) nounwind {
+  %gep = getelementptr <4 x i64>, ptr addrspace(3) %out, i32 7
+  store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, ptr addrspace(3) %gep, align 16
   ret void
 }
 
@@ -219,7 +219,7 @@ define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounw
 ; CIPLUS-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16{{$}}
 
 ; GCN: s_endpgm
-define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
-  store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
+define amdgpu_kernel void @local_v4i64_store_0_offset(ptr addrspace(3) %out) nounwind {
+  store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, ptr addrspace(3) %out, align 16
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index f8ecd9730504e..1739612176fcf 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -13,8 +13,8 @@
 ; NO-ATOMICS: v_add_f32
 ; NO-ATOMICS: ds_cmpst_rtn_b32
 ; NO-ATOMICS: s_cbranch_execnz
-define float @lds_atomic_fadd_ret_f32(float addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw fadd float addrspace(3)* %ptr, float 4.0 seq_cst
+define float @lds_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst
   ret float %result
 }
 
@@ -23,8 +23,8 @@ define float @lds_atomic_fadd_ret_f32(float addrspace(3)* %ptr) nounwind {
 ; GFX9-NOT: m0
 ; HAS-ATOMICS-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 4.0
 ; HAS-ATOMICS: ds_add_f32 v0, [[K]]
-define void @lds_atomic_fadd_noret_f32(float addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw fadd float addrspace(3)* %ptr, float 4.0 seq_cst
+define void @lds_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst
   ret void
 }
 
@@ -36,16 +36,16 @@ define void @lds_atomic_fadd_noret_f32(float addrspace(3)* %ptr) nounwind {
 ; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
 ; HAS-ATOMICS: s_waitcnt lgkmcnt(0)
 ; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
-define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
+define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) {
   %idx.add = add nuw i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3
   %shl1 = shl i32 %idx.add, 4
-  %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
-  %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
-  %a1 = atomicrmw fadd float addrspace(3)* %ptr0, float 4.2e+1 seq_cst
-  %a2 = atomicrmw fadd float addrspace(3)* %ptr1, float 4.2e+1 seq_cst
-  %a3 = atomicrmw fadd float addrspace(3)* %ptrf, float %a1 seq_cst
-  store float %a3, float addrspace(1)* %out
+  %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
+  %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3)
+  %a1 = atomicrmw fadd ptr addrspace(3) %ptr0, float 4.2e+1 seq_cst
+  %a2 = atomicrmw fadd ptr addrspace(3) %ptr1, float 4.2e+1 seq_cst
+  %a3 = atomicrmw fadd ptr addrspace(3) %ptrf, float %a1 seq_cst
+  store float %a3, ptr addrspace(1) %out
   ret void
 }
 
@@ -57,16 +57,16 @@ define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace
 ; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
 ; HAS-ATOMICS: s_waitcnt lgkmcnt(1)
 ; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
-define amdgpu_kernel void @lds_ds_fadd_one_as(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
+define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) {
   %idx.add = add nuw i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3
   %shl1 = shl i32 %idx.add, 4
-  %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
-  %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
-  %a1 = atomicrmw fadd float addrspace(3)* %ptr0, float 4.2e+1 syncscope("one-as") seq_cst
-  %a2 = atomicrmw fadd float addrspace(3)* %ptr1, float 4.2e+1 syncscope("one-as") seq_cst
-  %a3 = atomicrmw fadd float addrspace(3)* %ptrf, float %a1 syncscope("one-as") seq_cst
-  store float %a3, float addrspace(1)* %out
+  %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
+  %ptr1 = inttoptr i32 %shl1 to ptr addrspace(3)
+  %a1 = atomicrmw fadd ptr addrspace(3) %ptr0, float 4.2e+1 syncscope("one-as") seq_cst
+  %a2 = atomicrmw fadd ptr addrspace(3) %ptr1, float 4.2e+1 syncscope("one-as") seq_cst
+  %a3 = atomicrmw fadd ptr addrspace(3) %ptrf, float %a1 syncscope("one-as") seq_cst
+  store float %a3, ptr addrspace(1) %out
   ret void
 }
 
@@ -75,8 +75,8 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(float addrspace(1)* %out, float ad
 ; GCN: v_add_f64
 ; GCN: ds_cmpst_rtn_b64
 ; GCN: s_cbranch_execnz
-define double @lds_atomic_fadd_ret_f64(double addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
   ret double %result
 }
 
@@ -85,8 +85,8 @@ define double @lds_atomic_fadd_ret_f64(double addrspace(3)* %ptr) nounwind {
 ; GCN: v_add_f64
 ; GCN: ds_cmpst_rtn_b64
 ; GCN: s_cbranch_execnz
-define void @lds_atomic_fadd_noret_f64(double addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
   ret void
 }
 
@@ -95,8 +95,8 @@ define void @lds_atomic_fadd_noret_f64(double addrspace(3)* %ptr) nounwind {
 ; GCN: v_sub_f32
 ; GCN: ds_cmpst_rtn_b32
 ; GCN: s_cbranch_execnz
-define float @lds_atomic_fsub_ret_f32(float addrspace(3)* %ptr, float %val) nounwind {
-  %result = atomicrmw fsub float addrspace(3)* %ptr, float %val seq_cst
+define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwind {
+  %result = atomicrmw fsub ptr addrspace(3) %ptr, float %val seq_cst
   ret float %result
 }
 
@@ -104,8 +104,8 @@ define float @lds_atomic_fsub_ret_f32(float addrspace(3)* %ptr, float %val) noun
 ; GCN: ds_read_b32
 ; GCN: v_sub_f32
 ; GCN: ds_cmpst_rtn_b32
-define void @lds_atomic_fsub_noret_f32(float addrspace(3)* %ptr, float %val) nounwind {
-  %result = atomicrmw fsub float addrspace(3)* %ptr, float %val seq_cst
+define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwind {
+  %result = atomicrmw fsub ptr addrspace(3) %ptr, float %val seq_cst
   ret void
 }
 
@@ -114,8 +114,8 @@ define void @lds_atomic_fsub_noret_f32(float addrspace(3)* %ptr, float %val) nou
 ; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
 ; GCN: ds_cmpst_rtn_b64
 
-define double @lds_atomic_fsub_ret_f64(double addrspace(3)* %ptr, double %val) nounwind {
-  %result = atomicrmw fsub double addrspace(3)* %ptr, double %val seq_cst
+define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounwind {
+  %result = atomicrmw fsub ptr addrspace(3) %ptr, double %val seq_cst
   ret double %result
 }
 
@@ -124,7 +124,7 @@ define double @lds_atomic_fsub_ret_f64(double addrspace(3)* %ptr, double %val) n
 ; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
 ; GCN: ds_cmpst_rtn_b64
 ; GCN: s_cbranch_execnz
-define void @lds_atomic_fsub_noret_f64(double addrspace(3)* %ptr, double %val) nounwind {
-  %result = atomicrmw fsub double addrspace(3)* %ptr, double %val seq_cst
+define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounwind {
+  %result = atomicrmw fsub ptr addrspace(3) %ptr, double %val seq_cst
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics.ll b/llvm/test/CodeGen/AMDGPU/local-atomics.ll
index fcbe38e40070b..140c615170fcf 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics.ll
@@ -16,9 +16,9 @@
 ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_xchg_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw xchg ptr addrspace(3) %ptr, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -29,10 +29,10 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 a
 ; EG: LDS_WRXCHG_RET *
 ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_xchg_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw xchg ptr addrspace(3) %gep, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -43,10 +43,10 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out
 ; EG: LDS_WRXCHG_RET *
 ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xchg_ret_f32_offset(float addrspace(1)* %out, float addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr float, float addrspace(3)* %ptr, i32 4
-  %result = atomicrmw xchg float addrspace(3)* %gep, float 4.0 seq_cst
-  store float %result, float addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_xchg_ret_f32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr float, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw xchg ptr addrspace(3) %gep, float 4.0 seq_cst
+  store float %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -63,9 +63,9 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_f32_offset(float addrspace(1)* %o
 ; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_add_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw add ptr addrspace(3) %ptr, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -76,10 +76,10 @@ define amdgpu_kernel void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 ad
 ; EG: LDS_ADD_RET *
 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_add_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw add ptr addrspace(3) %gep, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -91,12 +91,12 @@ define amdgpu_kernel void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out,
 ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i32_bad_si_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
-  %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 %add
+  %result = atomicrmw add ptr addrspace(3) %gep, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -109,9 +109,9 @@ define amdgpu_kernel void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)
 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_add1_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw add ptr addrspace(3) %ptr, i32 1 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -124,10 +124,10 @@ define amdgpu_kernel void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 a
 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw add ptr addrspace(3) %gep, i32 1 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -139,12 +139,12 @@ define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out
 ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i32_bad_si_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
-  %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 %add
+  %result = atomicrmw add ptr addrspace(3) %gep, i32 1 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -156,9 +156,9 @@ define amdgpu_kernel void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1
 
 ; GCN: ds_sub_rtn_u32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_sub_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw sub ptr addrspace(3) %ptr, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -170,10 +170,10 @@ define amdgpu_kernel void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 ad
 
 ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw sub ptr addrspace(3) %gep, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -186,9 +186,9 @@ define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out,
 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_sub_rtn_u32  v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_sub1_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw sub ptr addrspace(3) %ptr, i32 1 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -201,10 +201,10 @@ define amdgpu_kernel void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 a
 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw sub ptr addrspace(3) %gep, i32 1 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -216,9 +216,9 @@ define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out
 
 ; GCN: ds_and_rtn_b32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_and_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw and ptr addrspace(3) %ptr, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -229,10 +229,10 @@ define amdgpu_kernel void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 ad
 ; EG: LDS_AND_RET *
 ; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_and_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw and ptr addrspace(3) %gep, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -243,9 +243,9 @@ define amdgpu_kernel void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out,
 ; EG: LDS_OR_RET *
 ; GCN: ds_or_rtn_b32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_or_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw or ptr addrspace(3) %ptr, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -256,10 +256,10 @@ define amdgpu_kernel void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 add
 ; EG: LDS_OR_RET *
 ; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_or_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw or ptr addrspace(3) %gep, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -270,9 +270,9 @@ define amdgpu_kernel void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out,
 ; EG: LDS_XOR_RET *
 ; GCN: ds_xor_rtn_b32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_xor_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw xor ptr addrspace(3) %ptr, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -283,18 +283,18 @@ define amdgpu_kernel void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 ad
 ; EG: LDS_XOR_RET *
 ; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_xor_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw xor ptr addrspace(3) %gep, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; FIXME: There is no atomic nand instr
 ; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:uction, so we somehow need to expand this.
-; define amdgpu_kernel void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-;   %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
-;   store i32 %result, i32 addrspace(1)* %out, align 4
+; define amdgpu_kernel void @lds_atomic_nand_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+;   %result = atomicrmw nand ptr addrspace(3) %ptr, i32 4 seq_cst
+;   store i32 %result, ptr addrspace(1) %out, align 4
 ;   ret void
 ; }
 
@@ -305,9 +305,9 @@ define amdgpu_kernel void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out,
 ; EG: LDS_MIN_INT_RET *
 ; GCN: ds_min_rtn_i32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_min_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw min ptr addrspace(3) %ptr, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -318,10 +318,10 @@ define amdgpu_kernel void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 ad
 ; EG: LDS_MIN_INT_RET *
 ; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_min_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw min ptr addrspace(3) %gep, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -332,9 +332,9 @@ define amdgpu_kernel void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out,
 ; EG: LDS_MAX_INT_RET *
 ; GCN: ds_max_rtn_i32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_max_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw max ptr addrspace(3) %ptr, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -345,10 +345,10 @@ define amdgpu_kernel void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 ad
 ; EG: LDS_MAX_INT_RET *
 ; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_max_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw max ptr addrspace(3) %gep, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -359,9 +359,9 @@ define amdgpu_kernel void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out,
 ; EG: LDS_MIN_UINT_RET *
 ; GCN: ds_min_rtn_u32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_umin_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw umin ptr addrspace(3) %ptr, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -372,10 +372,10 @@ define amdgpu_kernel void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 a
 ; EG: LDS_MIN_UINT_RET *
 ; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_umin_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw umin ptr addrspace(3) %gep, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -386,9 +386,9 @@ define amdgpu_kernel void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out
 ; EG: LDS_MAX_UINT_RET *
 ; GCN: ds_max_rtn_u32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_umax_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw umax ptr addrspace(3) %ptr, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -399,10 +399,10 @@ define amdgpu_kernel void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 a
 ; EG: LDS_MAX_UINT_RET *
 ; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
-  store i32 %result, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @lds_atomic_umax_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw umax ptr addrspace(3) %gep, i32 4 seq_cst
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -415,8 +415,8 @@ define amdgpu_kernel void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out
 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_xchg_noret_i32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw xchg ptr addrspace(3) %ptr, i32 4 seq_cst
   ret void
 }
 
@@ -426,9 +426,9 @@ define amdgpu_kernel void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nou
 
 ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw xchg ptr addrspace(3) %gep, i32 4 seq_cst
   ret void
 }
 
@@ -441,8 +441,8 @@ define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %p
 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; GCN: ds_add_u32 [[VPTR]], [[DATA]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_add_noret_i32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw add ptr addrspace(3) %ptr, i32 4 seq_cst
   ret void
 }
 
@@ -452,9 +452,9 @@ define amdgpu_kernel void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) noun
 
 ; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw add ptr addrspace(3) %gep, i32 4 seq_cst
   ret void
 }
 
@@ -465,11 +465,11 @@ define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %pt
 ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i32_bad_si_offset(ptr addrspace(3) %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
-  %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 %add
+  %result = atomicrmw add ptr addrspace(3) %gep, i32 4 seq_cst
   ret void
 }
 
@@ -480,8 +480,8 @@ define amdgpu_kernel void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(
 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
+define amdgpu_kernel void @lds_atomic_add1_noret_i32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw add ptr addrspace(3) %ptr, i32 1 seq_cst
   ret void
 }
 
@@ -492,9 +492,9 @@ define amdgpu_kernel void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nou
 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw add ptr addrspace(3) %gep, i32 1 seq_cst
   ret void
 }
 
@@ -505,11 +505,11 @@ define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %p
 ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i32_bad_si_offset(ptr addrspace(3) %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
-  %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 %add
+  %result = atomicrmw add ptr addrspace(3) %gep, i32 1 seq_cst
   ret void
 }
 
@@ -519,8 +519,8 @@ define amdgpu_kernel void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace
 
 ; GCN: ds_sub_u32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_sub_noret_i32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw sub ptr addrspace(3) %ptr, i32 4 seq_cst
   ret void
 }
 
@@ -530,9 +530,9 @@ define amdgpu_kernel void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) noun
 
 ; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw sub ptr addrspace(3) %gep, i32 4 seq_cst
   ret void
 }
 
@@ -543,8 +543,8 @@ define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %pt
 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
+define amdgpu_kernel void @lds_atomic_sub1_noret_i32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw sub ptr addrspace(3) %ptr, i32 1 seq_cst
   ret void
 }
 
@@ -555,9 +555,9 @@ define amdgpu_kernel void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nou
 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
+define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw sub ptr addrspace(3) %gep, i32 1 seq_cst
   ret void
 }
 
@@ -567,8 +567,8 @@ define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %p
 
 ; GCN: ds_and_b32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_and_noret_i32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw and ptr addrspace(3) %ptr, i32 4 seq_cst
   ret void
 }
 
@@ -578,9 +578,9 @@ define amdgpu_kernel void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) noun
 
 ; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw and ptr addrspace(3) %gep, i32 4 seq_cst
   ret void
 }
 
@@ -590,8 +590,8 @@ define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %pt
 
 ; GCN: ds_or_b32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_or_noret_i32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw or ptr addrspace(3) %ptr, i32 4 seq_cst
   ret void
 }
 
@@ -601,9 +601,9 @@ define amdgpu_kernel void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounw
 
 ; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw or ptr addrspace(3) %gep, i32 4 seq_cst
   ret void
 }
 
@@ -613,8 +613,8 @@ define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr
 
 ; GCN: ds_xor_b32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_xor_noret_i32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw xor ptr addrspace(3) %ptr, i32 4 seq_cst
   ret void
 }
 
@@ -624,16 +624,16 @@ define amdgpu_kernel void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) noun
 
 ; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw xor ptr addrspace(3) %gep, i32 4 seq_cst
   ret void
 }
 
 ; FIXME: There is no atomic nand instr
 ; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:uction, so we somehow need to expand this.
-; define amdgpu_kernel void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-;   %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
+; define amdgpu_kernel void @lds_atomic_nand_noret_i32(ptr addrspace(3) %ptr) nounwind {
+;   %result = atomicrmw nand ptr addrspace(3) %ptr, i32 4 seq_cst
 ;   ret void
 ; }
 
@@ -643,8 +643,8 @@ define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %pt
 
 ; GCN: ds_min_i32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_min_noret_i32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw min ptr addrspace(3) %ptr, i32 4 seq_cst
   ret void
 }
 
@@ -654,9 +654,9 @@ define amdgpu_kernel void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) noun
 
 ; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw min ptr addrspace(3) %gep, i32 4 seq_cst
   ret void
 }
 
@@ -666,8 +666,8 @@ define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %pt
 
 ; GCN: ds_max_i32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_max_noret_i32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw max ptr addrspace(3) %ptr, i32 4 seq_cst
   ret void
 }
 
@@ -677,9 +677,9 @@ define amdgpu_kernel void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) noun
 
 ; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw max ptr addrspace(3) %gep, i32 4 seq_cst
   ret void
 }
 
@@ -689,8 +689,8 @@ define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %pt
 
 ; GCN: ds_min_u32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_umin_noret_i32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw umin ptr addrspace(3) %ptr, i32 4 seq_cst
   ret void
 }
 
@@ -700,9 +700,9 @@ define amdgpu_kernel void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nou
 
 ; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw umin ptr addrspace(3) %gep, i32 4 seq_cst
   ret void
 }
 
@@ -712,8 +712,8 @@ define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %p
 
 ; GCN: ds_max_u32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_umax_noret_i32(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw umax ptr addrspace(3) %ptr, i32 4 seq_cst
   ret void
 }
 
@@ -723,8 +723,8 @@ define amdgpu_kernel void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nou
 
 ; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
+define amdgpu_kernel void @lds_atomic_umax_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw umax ptr addrspace(3) %gep, i32 4 seq_cst
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/local-atomics64.ll b/llvm/test/CodeGen/AMDGPU/local-atomics64.ll
index 5c67b299cf134..aa93adfe427b5 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics64.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -8,9 +8,9 @@
 
 ; GCN: ds_wrxchg_rtn_b64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_xchg_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw xchg ptr addrspace(3) %ptr, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -20,10 +20,10 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 a
 
 ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw xchg ptr addrspace(3) %gep, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -33,10 +33,10 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out
 
 ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xchg_ret_f64_offset(double addrspace(1)* %out, double addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr double, double addrspace(3)* %ptr, i32 4
-  %result = atomicrmw xchg double addrspace(3)* %gep, double 4.0 seq_cst
-  store double %result, double addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_xchg_ret_f64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr double, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw xchg ptr addrspace(3) %gep, double 4.0 seq_cst
+  store double %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -46,10 +46,10 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_f64_offset(double addrspace(1)* %
 
 ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xchg_ret_pointer_offset(i8* addrspace(1)* %out, i8* addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i8*, i8* addrspace(3)* %ptr, i32 4
-  %result = atomicrmw xchg i8* addrspace(3)* %gep, i8* null seq_cst
-  store i8* %result, i8* addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_xchg_ret_pointer_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr ptr, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw xchg ptr addrspace(3) %gep, ptr null seq_cst
+  store ptr %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -59,9 +59,9 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_pointer_offset(i8* addrspace(1)*
 
 ; GCN: ds_add_rtn_u64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_add_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw add ptr addrspace(3) %ptr, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -77,10 +77,10 @@ define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 ad
 ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOVDATA]]:[[HIVDATA]]] offset:32
 ; GCN: buffer_store_dwordx2 [[RESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
-  %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_add_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i64 4
+  %result = atomicrmw add ptr addrspace(3) %gep, i64 9 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -93,9 +93,9 @@ define amdgpu_kernel void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out,
 ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v[[[LOVDATA]]:[[HIVDATA]]]
 ; GCN: buffer_store_dwordx2 [[RESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_add1_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw add ptr addrspace(3) %ptr, i64 1 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -105,10 +105,10 @@ define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 a
 
 ; GCN: ds_add_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw add ptr addrspace(3) %gep, i64 1 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -118,9 +118,9 @@ define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out
 
 ; GCN: ds_sub_rtn_u64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_sub_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw sub ptr addrspace(3) %ptr, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -130,10 +130,10 @@ define amdgpu_kernel void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 ad
 
 ; GCN: ds_sub_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw sub ptr addrspace(3) %gep, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -146,9 +146,9 @@ define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out,
 ; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v[[[LOVDATA]]:[[HIVDATA]]]
 ; GCN: buffer_store_dwordx2 [[RESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_sub1_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw sub ptr addrspace(3) %ptr, i64 1 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -158,10 +158,10 @@ define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 a
 
 ; GCN: ds_sub_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw sub ptr addrspace(3) %gep, i64 1 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -171,9 +171,9 @@ define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out
 
 ; GCN: ds_and_rtn_b64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_and_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw and ptr addrspace(3) %ptr, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -183,10 +183,10 @@ define amdgpu_kernel void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 ad
 
 ; GCN: ds_and_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw and ptr addrspace(3) %gep, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -196,9 +196,9 @@ define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out,
 
 ; GCN: ds_or_rtn_b64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_or_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw or ptr addrspace(3) %ptr, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -208,10 +208,10 @@ define amdgpu_kernel void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 add
 
 ; GCN: ds_or_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw or ptr addrspace(3) %gep, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -221,9 +221,9 @@ define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out,
 
 ; GCN: ds_xor_rtn_b64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_xor_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw xor ptr addrspace(3) %ptr, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -233,18 +233,18 @@ define amdgpu_kernel void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 ad
 
 ; GCN: ds_xor_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw xor ptr addrspace(3) %gep, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
 ; XGCN-LABEL: {{^}}lds_atomic_nand_ret_i64:
-; define amdgpu_kernel void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-;   %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
-;   store i64 %result, i64 addrspace(1)* %out, align 8
+; define amdgpu_kernel void @lds_atomic_nand_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+;   %result = atomicrmw nand ptr addrspace(3) %ptr, i32 4 seq_cst
+;   store i64 %result, ptr addrspace(1) %out, align 8
 ;   ret void
 ; }
 
@@ -254,9 +254,9 @@ define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out,
 
 ; GCN: ds_min_rtn_i64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_min_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw min ptr addrspace(3) %ptr, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -266,10 +266,10 @@ define amdgpu_kernel void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 ad
 
 ; GCN: ds_min_rtn_i64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw min ptr addrspace(3) %gep, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -279,9 +279,9 @@ define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out,
 
 ; GCN: ds_max_rtn_i64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_max_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw max ptr addrspace(3) %ptr, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -291,10 +291,10 @@ define amdgpu_kernel void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 ad
 
 ; GCN: ds_max_rtn_i64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw max ptr addrspace(3) %gep, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -304,9 +304,9 @@ define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out,
 
 ; GCN: ds_min_rtn_u64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_umin_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw umin ptr addrspace(3) %ptr, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -316,10 +316,10 @@ define amdgpu_kernel void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 a
 
 ; GCN: ds_min_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw umin ptr addrspace(3) %gep, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -329,9 +329,9 @@ define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out
 
 ; GCN: ds_max_rtn_u64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_umax_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw umax ptr addrspace(3) %ptr, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -341,10 +341,10 @@ define amdgpu_kernel void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 a
 
 ; GCN: ds_max_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
-  store i64 %result, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw umax ptr addrspace(3) %gep, i64 4 seq_cst
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -354,8 +354,8 @@ define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out
 
 ; GCN: ds_wrxchg_rtn_b64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_xchg_noret_i64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw xchg ptr addrspace(3) %ptr, i64 4 seq_cst
   ret void
 }
 
@@ -365,9 +365,9 @@ define amdgpu_kernel void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nou
 
 ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw xchg ptr addrspace(3) %gep, i64 4 seq_cst
   ret void
 }
 
@@ -377,8 +377,8 @@ define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %p
 
 ; GCN: ds_add_u64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_add_noret_i64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw add ptr addrspace(3) %ptr, i64 4 seq_cst
   ret void
 }
 
@@ -393,9 +393,9 @@ define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) noun
 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; GCN: ds_add_u64 {{v[0-9]+}}, v[[[LOVDATA]]:[[HIVDATA]]] offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
-  %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
+define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i64 4
+  %result = atomicrmw add ptr addrspace(3) %gep, i64 9 seq_cst
   ret void
 }
 
@@ -407,8 +407,8 @@ define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %pt
 ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
 ; GCN: ds_add_u64 {{v[0-9]+}}, v[[[LOVDATA]]:[[HIVDATA]]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
+define amdgpu_kernel void @lds_atomic_add1_noret_i64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw add ptr addrspace(3) %ptr, i64 1 seq_cst
   ret void
 }
 
@@ -418,9 +418,9 @@ define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nou
 
 ; GCN: ds_add_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
+define amdgpu_kernel void @lds_atomic_add1_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw add ptr addrspace(3) %gep, i64 1 seq_cst
   ret void
 }
 
@@ -430,8 +430,8 @@ define amdgpu_kernel void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %p
 
 ; GCN: ds_sub_u64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_sub_noret_i64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw sub ptr addrspace(3) %ptr, i64 4 seq_cst
   ret void
 }
 
@@ -441,9 +441,9 @@ define amdgpu_kernel void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) noun
 
 ; GCN: ds_sub_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw sub ptr addrspace(3) %gep, i64 4 seq_cst
   ret void
 }
 
@@ -455,8 +455,8 @@ define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %pt
 ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
 ; GCN: ds_sub_u64 {{v[0-9]+}}, v[[[LOVDATA]]:[[HIVDATA]]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
+define amdgpu_kernel void @lds_atomic_sub1_noret_i64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw sub ptr addrspace(3) %ptr, i64 1 seq_cst
   ret void
 }
 
@@ -466,9 +466,9 @@ define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nou
 
 ; GCN: ds_sub_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
+define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw sub ptr addrspace(3) %gep, i64 1 seq_cst
   ret void
 }
 
@@ -478,8 +478,8 @@ define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %p
 
 ; GCN: ds_and_b64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_and_noret_i64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw and ptr addrspace(3) %ptr, i64 4 seq_cst
   ret void
 }
 
@@ -489,9 +489,9 @@ define amdgpu_kernel void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) noun
 
 ; GCN: ds_and_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw and ptr addrspace(3) %gep, i64 4 seq_cst
   ret void
 }
 
@@ -501,8 +501,8 @@ define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %pt
 
 ; GCN: ds_or_b64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_or_noret_i64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw or ptr addrspace(3) %ptr, i64 4 seq_cst
   ret void
 }
 
@@ -512,9 +512,9 @@ define amdgpu_kernel void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounw
 
 ; GCN: ds_or_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw or ptr addrspace(3) %gep, i64 4 seq_cst
   ret void
 }
 
@@ -524,8 +524,8 @@ define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr
 
 ; GCN: ds_xor_b64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_xor_noret_i64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw xor ptr addrspace(3) %ptr, i64 4 seq_cst
   ret void
 }
 
@@ -535,16 +535,16 @@ define amdgpu_kernel void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) noun
 
 ; GCN: ds_xor_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw xor ptr addrspace(3) %gep, i64 4 seq_cst
   ret void
 }
 
 ; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
 ; XGCN-LABEL: {{^}}lds_atomic_nand_noret_i64:
-; define amdgpu_kernel void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-;   %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
+; define amdgpu_kernel void @lds_atomic_nand_noret_i64(ptr addrspace(3) %ptr) nounwind {
+;   %result = atomicrmw nand ptr addrspace(3) %ptr, i32 4 seq_cst
 ;   ret void
 ; }
 
@@ -554,8 +554,8 @@ define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %pt
 
 ; GCN: ds_min_i64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_min_noret_i64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw min ptr addrspace(3) %ptr, i64 4 seq_cst
   ret void
 }
 
@@ -565,9 +565,9 @@ define amdgpu_kernel void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) noun
 
 ; GCN: ds_min_i64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw min ptr addrspace(3) %gep, i64 4 seq_cst
   ret void
 }
 
@@ -577,8 +577,8 @@ define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %pt
 
 ; GCN: ds_max_i64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_max_noret_i64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw max ptr addrspace(3) %ptr, i64 4 seq_cst
   ret void
 }
 
@@ -588,9 +588,9 @@ define amdgpu_kernel void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) noun
 
 ; GCN: ds_max_i64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw max ptr addrspace(3) %gep, i64 4 seq_cst
   ret void
 }
 
@@ -600,8 +600,8 @@ define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %pt
 
 ; GCN: ds_min_u64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_umin_noret_i64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw umin ptr addrspace(3) %ptr, i64 4 seq_cst
   ret void
 }
 
@@ -611,9 +611,9 @@ define amdgpu_kernel void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nou
 
 ; GCN: ds_min_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw umin ptr addrspace(3) %gep, i64 4 seq_cst
   ret void
 }
 
@@ -623,8 +623,8 @@ define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %p
 
 ; GCN: ds_max_u64
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_umax_noret_i64(ptr addrspace(3) %ptr) nounwind {
+  %result = atomicrmw umax ptr addrspace(3) %ptr, i64 4 seq_cst
   ret void
 }
 
@@ -634,8 +634,8 @@ define amdgpu_kernel void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nou
 
 ; GCN: ds_max_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define amdgpu_kernel void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
+define amdgpu_kernel void @lds_atomic_umax_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %result = atomicrmw umax ptr addrspace(3) %gep, i64 4 seq_cst
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index 27207ea944790..22447d47d3e27 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -4,7 +4,7 @@
 
 @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
 
-define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory(ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: local_memory:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
@@ -26,16 +26,16 @@ define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 {
 ; GCN-NEXT:    s_endpgm
 entry:
   %y.i = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
-  store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds [128 x i32], ptr addrspace(3) @local_memory.local_mem, i32 0, i32 %y.i
+  store i32 %y.i, ptr addrspace(3) %arrayidx, align 4
   %add = add nsw i32 %y.i, 1
   %cmp = icmp eq i32 %add, 16
   %.add = select i1 %cmp, i32 0, i32 %add
   call void @llvm.amdgcn.s.barrier()
-  %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
-  %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
-  store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx1 = getelementptr inbounds [128 x i32], ptr addrspace(3) @local_memory.local_mem, i32 0, i32 %.add
+  %tmp = load i32, ptr addrspace(3) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %y.i
+  store i32 %tmp, ptr addrspace(1) %arrayidx2, align 4
   ret void
 }
 
@@ -43,7 +43,7 @@ entry:
 @local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 
 ; Check that the LDS size emitted correctly
-define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory_two_objects(ptr addrspace(1) %out) #0 {
 ; SI-LABEL: local_memory_two_objects:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
@@ -86,22 +86,22 @@ define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
 ; CI-NEXT:    s_endpgm
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x()
-  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
-  store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(3) @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+  store i32 %x.i, ptr addrspace(3) %arrayidx, align 4
   %mul = shl nsw i32 %x.i, 1
-  %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
-  store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [4 x i32], ptr addrspace(3) @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+  store i32 %mul, ptr addrspace(3) %arrayidx1, align 4
   %sub = sub nsw i32 3, %x.i
   call void @llvm.amdgcn.s.barrier()
-  %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
-  %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
-  store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
-  %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
-  %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
+  %arrayidx2 = getelementptr inbounds [4 x i32], ptr addrspace(3) @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+  %tmp = load i32, ptr addrspace(3) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %x.i
+  store i32 %tmp, ptr addrspace(1) %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds [4 x i32], ptr addrspace(3) @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+  %tmp1 = load i32, ptr addrspace(3) %arrayidx4, align 4
   %add = add nsw i32 %x.i, 4
-  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
-  store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
+  %arrayidx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %add
+  store i32 %tmp1, ptr addrspace(1) %arrayidx5, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/local-memory.ll b/llvm/test/CodeGen/AMDGPU/local-memory.ll
index 78e76f7e00301..2d36949526722 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.ll
@@ -14,12 +14,12 @@
 ; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4
 
 ; R600: LDS_READ_RET
-define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_i32_local_const_ptr(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
 entry:
-  %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
-  %tmp1 = load i32, i32 addrspace(3)* %tmp0
-  %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  store i32 %tmp1, i32 addrspace(1)* %tmp2
+  %tmp0 = getelementptr [512 x i32], ptr addrspace(3) @lds, i32 0, i32 1
+  %tmp1 = load i32, ptr addrspace(3) %tmp0
+  %tmp2 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  store i32 %tmp1, ptr addrspace(1) %tmp2
   ret void
 }
 
@@ -30,14 +30,13 @@ entry:
 ; R600: LDS_READ_RET
 ; GCN-DAG: ds_read_b32
 ; GCN-DAG: ds_read2_b32
-define amdgpu_kernel void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
-  %scalar = load i32, i32 addrspace(3)* %in
-  %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
-  %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
-  %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4
+define amdgpu_kernel void @load_i32_v2i32_local(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+  %scalar = load i32, ptr addrspace(3) %in
+  %vec_ptr = getelementptr <2 x i32>, ptr addrspace(3) %in, i32 2
+  %vec0 = load <2 x i32>, ptr addrspace(3) %vec_ptr, align 4
   %vec1 = insertelement <2 x i32> <i32 0, i32 0>, i32 %scalar, i32 0
   %vec = add <2 x i32> %vec0, %vec1
-  store <2 x i32> %vec, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %vec, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index a8e4999a0bb0e..df126d5c3b64b 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -16,7 +16,7 @@
 ; so eliminateFrameIndex would not adjust the access to use the
 ; correct FP offset.
 
-define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) {
 ; MUBUF-LABEL: local_stack_offset_uses_sp:
 ; MUBUF:       ; %bb.0: ; %entry
 ; MUBUF-NEXT:    s_add_u32 s0, s0, s9
@@ -92,19 +92,18 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
 entry:
   %pin.low = alloca i32, align 8192, addrspace(5)
   %local.area = alloca [1060 x i64], align 4096, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %pin.low
-  %local.area.cast = bitcast [1060 x i64] addrspace(5)* %local.area to i8 addrspace(5)*
-  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %local.area.cast, i8 0, i32 8480, i1 true)
-  %gep.large.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 1050
-  %gep.small.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 8
-  %load0 = load volatile i64, i64 addrspace(5)* %gep.large.offset
-  %load1 = load volatile i64, i64 addrspace(5)* %gep.small.offset
+  store volatile i32 0, ptr addrspace(5) %pin.low
+  call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %local.area, i8 0, i32 8480, i1 true)
+  %gep.large.offset = getelementptr inbounds [1060 x i64], ptr addrspace(5) %local.area, i64 0, i64 1050
+  %gep.small.offset = getelementptr inbounds [1060 x i64], ptr addrspace(5) %local.area, i64 0, i64 8
+  %load0 = load volatile i64, ptr addrspace(5) %gep.large.offset
+  %load1 = load volatile i64, ptr addrspace(5) %gep.small.offset
   %add0 = add i64 %load0, %load1
-  store volatile i64 %add0, i64 addrspace(1)* %out
+  store volatile i64 %add0, ptr addrspace(1) %out
   ret void
 }
 
-define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
+define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
 ; MUBUF-LABEL: func_local_stack_offset_uses_sp:
 ; MUBUF:       ; %bb.0: ; %entry
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -188,19 +187,18 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
 entry:
   %pin.low = alloca i32, align 8192, addrspace(5)
   %local.area = alloca [1060 x i64], align 4096, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %pin.low
-  %local.area.cast = bitcast [1060 x i64] addrspace(5)* %local.area to i8 addrspace(5)*
-  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %local.area.cast, i8 0, i32 8480, i1 true)
-  %gep.large.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 1050
-  %gep.small.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 8
-  %load0 = load volatile i64, i64 addrspace(5)* %gep.large.offset
-  %load1 = load volatile i64, i64 addrspace(5)* %gep.small.offset
+  store volatile i32 0, ptr addrspace(5) %pin.low
+  call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %local.area, i8 0, i32 8480, i1 true)
+  %gep.large.offset = getelementptr inbounds [1060 x i64], ptr addrspace(5) %local.area, i64 0, i64 1050
+  %gep.small.offset = getelementptr inbounds [1060 x i64], ptr addrspace(5) %local.area, i64 0, i64 8
+  %load0 = load volatile i64, ptr addrspace(5) %gep.large.offset
+  %load1 = load volatile i64, ptr addrspace(5) %gep.small.offset
   %add0 = add i64 %load0, %load1
-  store volatile i64 %add0, i64 addrspace(1)* %out
+  store volatile i64 %add0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @local_stack_offset_uses_sp_flat(<3 x i64> addrspace(1)* %out) {
+define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out) {
 ; MUBUF-LABEL: local_stack_offset_uses_sp_flat:
 ; MUBUF:       ; %bb.0: ; %entry
 ; MUBUF-NEXT:    s_add_u32 s0, s0, s9
@@ -319,18 +317,16 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(<3 x i64> addrspace(1
 entry:
   %pin.low = alloca i32, align 1024, addrspace(5)
   %local.area = alloca [160 x <3 x i64>], align 8192, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %pin.low
-  %local.area.cast = bitcast [160 x <3 x i64>] addrspace(5)* %local.area to i8 addrspace(5)*
-  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %local.area.cast, i8 0, i32 8480, i1 true)
-  %gep.large.offset = getelementptr inbounds [160 x <3 x i64>], [160 x <3 x i64>] addrspace(5)* %local.area, i64 0, i64 150
-  %gep.small.offset = getelementptr inbounds [160 x <3 x i64>], [160 x <3 x i64>] addrspace(5)* %local.area, i64 0, i64 0
-  %load0 = load volatile <3 x i64>, <3 x i64> addrspace(5)* %gep.large.offset
-  %load1 = load volatile <3 x i64>, <3 x i64> addrspace(5)* %gep.small.offset
+  store volatile i32 0, ptr addrspace(5) %pin.low
+  call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %local.area, i8 0, i32 8480, i1 true)
+  %gep.large.offset = getelementptr inbounds [160 x <3 x i64>], ptr addrspace(5) %local.area, i64 0, i64 150
+  %load0 = load volatile <3 x i64>, ptr addrspace(5) %gep.large.offset
+  %load1 = load volatile <3 x i64>, ptr addrspace(5) %local.area
   %add0 = add <3 x i64> %load0, %load1
-  store volatile <3 x i64> %add0, <3 x i64> addrspace(1)* %out
+  store volatile <3 x i64> %add0, ptr addrspace(1) %out
   ret void
 }
 
-declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture writeonly, i8, i32, i1 immarg) #0
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #0
 
 attributes #0 = { argmemonly nounwind willreturn writeonly }

diff  --git a/llvm/test/CodeGen/AMDGPU/local-stack-slot-offset.ll b/llvm/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
index 790715cda728e..30cba8e9e1a9e 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
@@ -16,19 +16,18 @@ main_body:
   %m1 = alloca [513 x float], addrspace(5)
   %m2 = alloca [513 x float], addrspace(5)
 
-  %gep1.store = getelementptr [513 x float], [513 x float] addrspace(5)* %m1, i32 0, i32 %idx1
-  store float %v1, float addrspace(5)* %gep1.store
+  %gep1.store = getelementptr [513 x float], ptr addrspace(5) %m1, i32 0, i32 %idx1
+  store float %v1, ptr addrspace(5) %gep1.store
 
-  %gep2.store = getelementptr [513 x float], [513 x float] addrspace(5)* %m2, i32 0, i32 %idx2
-  store float %v2, float addrspace(5)* %gep2.store
+  %gep2.store = getelementptr [513 x float], ptr addrspace(5) %m2, i32 0, i32 %idx2
+  store float %v2, ptr addrspace(5) %gep2.store
 
 ; This used to use a base reg equal to 0.
-  %gep1.load = getelementptr [513 x float], [513 x float] addrspace(5)* %m1, i32 0, i32 0
-  %out1 = load float, float addrspace(5)* %gep1.load
+  %out1 = load float, ptr addrspace(5) %m1
 
 ; This used to attempt to re-use the base reg at 0, generating an out-of-bounds instruction offset.
-  %gep2.load = getelementptr [513 x float], [513 x float] addrspace(5)* %m2, i32 0, i32 512
-  %out2 = load float, float addrspace(5)* %gep2.load
+  %gep2.load = getelementptr [513 x float], ptr addrspace(5) %m2, i32 0, i32 512
+  %out2 = load float, ptr addrspace(5) %gep2.load
 
   %r = fadd float %out1, %out2
   ret float %r

diff  --git a/llvm/test/CodeGen/AMDGPU/loop-address.ll b/llvm/test/CodeGen/AMDGPU/loop-address.ll
index e25d4f4b4f5f2..5f558fa7c7eb6 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-address.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-address.ll
@@ -5,7 +5,7 @@
 ;CHECK: LOOP_BREAK @10
 ;CHECK: POP @10
 
-define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 {
+define amdgpu_kernel void @loop_ge(ptr addrspace(1) nocapture %out, i32 %iterations) #0 {
 entry:
   %cmp5 = icmp sgt i32 %iterations, 0
   br i1 %cmp5, label %for.body, label %for.end
@@ -14,8 +14,8 @@ for.body:                                         ; preds = %for.body, %entry
   %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ]
   %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ]
   %i.07 = add nsw i32 %i.07.in, -1
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06
-  store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %ai.06
+  store i32 %i.07, ptr addrspace(1) %arrayidx, align 4
   %add = add nsw i32 %ai.06, 1
   %exitcond = icmp eq i32 %add, %iterations
   br i1 %exitcond, label %for.end, label %for.body
@@ -28,7 +28,7 @@ attributes #0 = { nounwind "fp-contract-model"="standard" "relocation-model"="pi
 
 !opencl.kernels = !{!0, !1, !2, !3}
 
-!0 = !{void (i32 addrspace(1)*, i32)* @loop_ge}
+!0 = !{ptr @loop_ge}
 !1 = !{null}
 !2 = !{null}
 !3 = !{null}

diff  --git a/llvm/test/CodeGen/AMDGPU/loop-idiom.ll b/llvm/test/CodeGen/AMDGPU/loop-idiom.ll
index a274774af1b9f..c16b43555237b 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-idiom.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-idiom.ll
@@ -10,17 +10,17 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
 ; FUNC: @no_memcpy
 ; R600-NOT: {{^}}llvm.memcpy
 ; SI-NOT: {{^}}llvm.memcpy
-define amdgpu_kernel void @no_memcpy(i8 addrspace(3)* %in, i32 %size) {
+define amdgpu_kernel void @no_memcpy(ptr addrspace(3) %in, i32 %size) {
 entry:
   %dest = alloca i8, i32 32, addrspace(5)
   br label %for.body
 
 for.body:
   %0 = phi i32 [0, %entry], [%4, %for.body]
-  %1 = getelementptr i8, i8 addrspace(3)* %in, i32 %0
-  %2 = getelementptr i8, i8 addrspace(5)* %dest, i32 %0
-  %3 = load i8, i8 addrspace(3)* %1
-  store i8 %3, i8 addrspace(5)* %2
+  %1 = getelementptr i8, ptr addrspace(3) %in, i32 %0
+  %2 = getelementptr i8, ptr addrspace(5) %dest, i32 %0
+  %3 = load i8, ptr addrspace(3) %1
+  store i8 %3, ptr addrspace(5) %2
   %4 = add i32 %0, 1
   %5 = icmp eq i32 %4, %size
   br i1 %5, label %for.end, label %for.body
@@ -41,8 +41,8 @@ entry:
 
 for.body:
   %0 = phi i32 [0, %entry], [%2, %for.body]
-  %1 = getelementptr i8, i8 addrspace(5)* %dest, i32 %0
-  store i8 0, i8 addrspace(5)* %1
+  %1 = getelementptr i8, ptr addrspace(5) %dest, i32 %0
+  store i8 0, ptr addrspace(5) %1
   %2 = add i32 %0, 1
   %3 = icmp eq i32 %2, %size
   br i1 %3, label %for.end, label %for.body

diff  --git a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
index 6f5b220d3c354..546022b4f9c43 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
@@ -9,7 +9,7 @@ define void @loop_on_argument(i1 %arg) {
 ; IR:       loop:
 ; IR-NEXT:    [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP0:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
 ; IR-NEXT:    [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[ARG:%.*]], i64 [[PHI_BROKEN]])
-; IR-NEXT:    store volatile i32 0, i32 addrspace(1)* undef, align 4
+; IR-NEXT:    store volatile i32 0, ptr addrspace(1) undef, align 4
 ; IR-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]])
 ; IR-NEXT:    br i1 [[TMP1]], label [[EXIT:%.*]], label [[LOOP]]
 ; IR:       exit:
@@ -38,7 +38,7 @@ entry:
   br label %loop
 
 loop:
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br i1 %arg, label %exit, label %loop
 
 exit:

diff  --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll
index 8ecbd04188545..1227f83baabe7 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll
@@ -13,7 +13,7 @@
 ; GFX10:          s_sleep 0
 ; GFX10:          s_cbranch_scc0 [[L1]]
 ; GFX10-NEXT:     s_endpgm
-define amdgpu_kernel void @test_loop_64(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_loop_64(ptr addrspace(1) nocapture %arg) {
 bb:
   br label %bb2
 
@@ -38,7 +38,7 @@ bb2:                                              ; preds = %bb2, %bb
 ; GFX10:          s_sleep 0
 ; GFX10:          s_cbranch_scc0 [[L1]]
 ; GFX10-NEXT:     s_endpgm
-define amdgpu_kernel void @test_loop_128(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_loop_128(ptr addrspace(1) nocapture %arg) {
 bb:
   br label %bb2
 
@@ -80,7 +80,7 @@ bb2:                                              ; preds = %bb2, %bb
 ; GFX10:          s_cbranch_scc0 [[L1]]
 ; GFX10-NEXT:     s_inst_prefetch 0x2
 ; GFX10-NEXT:     s_endpgm
-define amdgpu_kernel void @test_loop_192(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_loop_192(ptr addrspace(1) nocapture %arg) {
 bb:
   br label %bb2
 
@@ -136,7 +136,7 @@ bb2:                                              ; preds = %bb2, %bb
 ; GFX10:          s_sleep 0
 ; GFX10:          s_cbranch_scc0 [[L1]]
 ; GFX10-NEXT:     s_endpgm
-define amdgpu_kernel void @test_loop_256(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_loop_256(ptr addrspace(1) nocapture %arg) {
 bb:
   br label %bb2
 
@@ -220,7 +220,7 @@ bb2:                                              ; preds = %bb2, %bb
 ; GFX10:          s_cbranch_scc{{[01]}} [[L1]]
 ; GFX10-NEXT:     s_inst_prefetch 0x2
 ; GFX10-NEXT:     s_endpgm
-define amdgpu_kernel void @test_loop_prefetch_inner_outer(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_loop_prefetch_inner_outer(ptr addrspace(1) nocapture %arg) {
 bb:
   br label %bb2
 
@@ -303,7 +303,7 @@ bb4:
 ; GFX10-NEXT:     s_inst_prefetch 0x2
 ; GFX10:          s_cbranch_scc{{[01]}} [[L0]]
 ; GFX10-NEXT:     s_endpgm
-define amdgpu_kernel void @test_loop_prefetch_inner_outer_noouter(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_loop_prefetch_inner_outer_noouter(ptr addrspace(1) nocapture %arg) {
 bb:
   br label %bb2
 

diff  --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll
index 6c23be91a153d..3959ff5487f8b 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_break.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll
@@ -17,7 +17,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0
 ; OPT-NEXT:    br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]]
 ; OPT:       bb4:
-; OPT-NEXT:    [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT:    [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; OPT-NEXT:    [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]]
 ; OPT-NEXT:    br label [[FLOW]]
 ; OPT:       Flow:
@@ -77,7 +77,7 @@ bb1:
   br i1 %cmp0, label %bb4, label %bb9
 
 bb4:
-  %load = load volatile i32, i32 addrspace(1)* undef, align 4
+  %load = load volatile i32, ptr addrspace(1) undef, align 4
   %cmp1 = icmp slt i32 %my.tmp, %load
   br i1 %cmp1, label %bb1, label %bb9
 
@@ -98,7 +98,7 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0
 ; OPT-NEXT:    br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]]
 ; OPT:       bb4:
-; OPT-NEXT:    [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT:    [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; OPT-NEXT:    [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]]
 ; OPT-NEXT:    br label [[FLOW]]
 ; OPT:       Flow:
@@ -109,7 +109,7 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    br i1 [[TMP1]], label [[BB9:%.*]], label [[BB1]]
 ; OPT:       bb9:
 ; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]])
-; OPT-NEXT:    store volatile i32 7, i32 addrspace(3)* undef, align 4
+; OPT-NEXT:    store volatile i32 7, ptr addrspace(3) undef, align 4
 ; OPT-NEXT:    ret void
 ;
 ; GCN-LABEL: undef_phi_cond_break_loop:
@@ -160,7 +160,7 @@ bb1:                                              ; preds = %Flow, %bb
   br i1 %cmp0, label %bb4, label %Flow
 
 bb4:                                              ; preds = %bb1
-  %load = load volatile i32, i32 addrspace(1)* undef, align 4
+  %load = load volatile i32, ptr addrspace(1) undef, align 4
   %cmp1 = icmp sge i32 %my.tmp, %load
   br label %Flow
 
@@ -170,7 +170,7 @@ Flow:                                             ; preds = %bb4, %bb1
   br i1 %my.tmp3, label %bb9, label %bb1
 
 bb9:                                              ; preds = %Flow
-  store volatile i32 7, i32 addrspace(3)* undef
+  store volatile i32 7, ptr addrspace(3) undef
   ret void
 }
 
@@ -190,18 +190,18 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0
 ; OPT-NEXT:    br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]]
 ; OPT:       bb4:
-; OPT-NEXT:    [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT:    [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; OPT-NEXT:    [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]]
 ; OPT-NEXT:    br label [[FLOW]]
 ; OPT:       Flow:
 ; OPT-NEXT:    [[MY_TMP2]] = phi i32 [ [[LSR_IV_NEXT]], [[BB4]] ], [ undef, [[BB1]] ]
-; OPT-NEXT:    [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), [[BB1]] ]
+; OPT-NEXT:    [[MY_TMP3:%.*]] = phi i1 [ [[CMP1]], [[BB4]] ], [ icmp ne (ptr addrspace(3) inttoptr (i32 4 to ptr addrspace(3)), ptr addrspace(3) @lds), [[BB1]] ]
 ; OPT-NEXT:    [[TMP0]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[MY_TMP3]], i64 [[PHI_BROKEN]])
 ; OPT-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP0]])
 ; OPT-NEXT:    br i1 [[TMP1]], label [[BB9:%.*]], label [[BB1]]
 ; OPT:       bb9:
 ; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]])
-; OPT-NEXT:    store volatile i32 7, i32 addrspace(3)* undef, align 4
+; OPT-NEXT:    store volatile i32 7, ptr addrspace(3) undef, align 4
 ; OPT-NEXT:    ret void
 ;
 ; GCN-LABEL: constexpr_phi_cond_break_loop:
@@ -252,17 +252,17 @@ bb1:                                              ; preds = %Flow, %bb
   br i1 %cmp0, label %bb4, label %Flow
 
 bb4:                                              ; preds = %bb1
-  %load = load volatile i32, i32 addrspace(1)* undef, align 4
+  %load = load volatile i32, ptr addrspace(1) undef, align 4
   %cmp1 = icmp sge i32 %my.tmp, %load
   br label %Flow
 
 Flow:                                             ; preds = %bb4, %bb1
   %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
-  %my.tmp3 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), %bb1 ]
+  %my.tmp3 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (ptr addrspace(3) inttoptr (i32 4 to ptr addrspace(3)), ptr addrspace(3) @lds), %bb1 ]
   br i1 %my.tmp3, label %bb9, label %bb1
 
 bb9:                                              ; preds = %Flow
-  store volatile i32 7, i32 addrspace(3)* undef
+  store volatile i32 7, ptr addrspace(3) undef
   ret void
 }
 
@@ -279,7 +279,7 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0
 ; OPT-NEXT:    br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]]
 ; OPT:       bb4:
-; OPT-NEXT:    [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT:    [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; OPT-NEXT:    [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]]
 ; OPT-NEXT:    br label [[FLOW]]
 ; OPT:       Flow:
@@ -290,7 +290,7 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    br i1 [[TMP1]], label [[BB9:%.*]], label [[BB1]]
 ; OPT:       bb9:
 ; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]])
-; OPT-NEXT:    store volatile i32 7, i32 addrspace(3)* undef, align 4
+; OPT-NEXT:    store volatile i32 7, ptr addrspace(3) undef, align 4
 ; OPT-NEXT:    ret void
 ;
 ; GCN-LABEL: true_phi_cond_break_loop:
@@ -341,7 +341,7 @@ bb1:                                              ; preds = %Flow, %bb
   br i1 %cmp0, label %bb4, label %Flow
 
 bb4:                                              ; preds = %bb1
-  %load = load volatile i32, i32 addrspace(1)* undef, align 4
+  %load = load volatile i32, ptr addrspace(1) undef, align 4
   %cmp1 = icmp sge i32 %my.tmp, %load
   br label %Flow
 
@@ -351,7 +351,7 @@ Flow:                                             ; preds = %bb4, %bb1
   br i1 %my.tmp3, label %bb9, label %bb1
 
 bb9:                                              ; preds = %Flow
-  store volatile i32 7, i32 addrspace(3)* undef
+  store volatile i32 7, ptr addrspace(3) undef
   ret void
 }
 
@@ -368,7 +368,7 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0
 ; OPT-NEXT:    br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]]
 ; OPT:       bb4:
-; OPT-NEXT:    [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT:    [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; OPT-NEXT:    [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]]
 ; OPT-NEXT:    br label [[FLOW]]
 ; OPT:       Flow:
@@ -379,7 +379,7 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    br i1 [[TMP1]], label [[BB9:%.*]], label [[BB1]]
 ; OPT:       bb9:
 ; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]])
-; OPT-NEXT:    store volatile i32 7, i32 addrspace(3)* undef, align 4
+; OPT-NEXT:    store volatile i32 7, ptr addrspace(3) undef, align 4
 ; OPT-NEXT:    ret void
 ;
 ; GCN-LABEL: false_phi_cond_break_loop:
@@ -430,7 +430,7 @@ bb1:                                              ; preds = %Flow, %bb
   br i1 %cmp0, label %bb4, label %Flow
 
 bb4:                                              ; preds = %bb1
-  %load = load volatile i32, i32 addrspace(1)* undef, align 4
+  %load = load volatile i32, ptr addrspace(1) undef, align 4
   %cmp1 = icmp sge i32 %my.tmp, %load
   br label %Flow
 
@@ -440,7 +440,7 @@ Flow:                                             ; preds = %bb4, %bb1
   br i1 %my.tmp3, label %bb9, label %bb1
 
 bb9:                                              ; preds = %Flow
-  store volatile i32 7, i32 addrspace(3)* undef
+  store volatile i32 7, ptr addrspace(3) undef
   ret void
 }
 
@@ -460,7 +460,7 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0
 ; OPT-NEXT:    br i1 [[CMP0]], label [[BB4:%.*]], label [[FLOW]]
 ; OPT:       bb4:
-; OPT-NEXT:    [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT:    [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; OPT-NEXT:    [[CMP1:%.*]] = icmp sge i32 [[MY_TMP]], [[LOAD]]
 ; OPT-NEXT:    br label [[FLOW]]
 ; OPT:       Flow:
@@ -472,7 +472,7 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    br i1 [[TMP1]], label [[BB9:%.*]], label [[BB1]]
 ; OPT:       bb9:
 ; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP0]])
-; OPT-NEXT:    store volatile i32 7, i32 addrspace(3)* undef, align 4
+; OPT-NEXT:    store volatile i32 7, ptr addrspace(3) undef, align 4
 ; OPT-NEXT:    ret void
 ;
 ; GCN-LABEL: invert_true_phi_cond_break_loop:
@@ -524,7 +524,7 @@ bb1:                                              ; preds = %Flow, %bb
   br i1 %cmp0, label %bb4, label %Flow
 
 bb4:                                              ; preds = %bb1
-  %load = load volatile i32, i32 addrspace(1)* undef, align 4
+  %load = load volatile i32, ptr addrspace(1) undef, align 4
   %cmp1 = icmp sge i32 %my.tmp, %load
   br label %Flow
 
@@ -534,7 +534,7 @@ Flow:                                             ; preds = %bb4, %bb1
   br i1 %my.tmp3, label %bb1, label %bb9
 
 bb9:                                              ; preds = %Flow
-  store volatile i32 7, i32 addrspace(3)* undef
+  store volatile i32 7, ptr addrspace(3) undef
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-declaration.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-declaration.ll
index d293984ed7d09..b6409911dd23f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-declaration.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-declaration.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck %s
 
-@llvm.global_ctors = external global [2 x { i32, void ()*, i8* }]
-@llvm.global_dtors = external global [2 x { i32, void ()*, i8* }]
+@llvm.global_ctors = external global [2 x { i32, ptr, ptr }]
+@llvm.global_dtors = external global [2 x { i32, ptr, ptr }]
 
 ; No amdgpu_kernels emitted for global_ctors declaration
 ; CHECK-NOT: amdgcn.device.init

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
index c763c434ea49f..f230d8d5e2dda 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
@@ -5,8 +5,8 @@
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-ctor-dtor,amdgpu-lower-ctor-dtor < %s | FileCheck %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf -s - 2>&1 | FileCheck %s -check-prefix=VISIBILITY
 
-@llvm.global_ctors = appending addrspace(1) global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }]
-@llvm.global_dtors = appending addrspace(1) global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }]
+@llvm.global_ctors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }]
+@llvm.global_dtors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }]
 
 ; CHECK-NOT: @llvm.global_ctors
 ; CHECK-NOT: @llvm.global_dtors

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-empty-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-empty-ctor-dtor.ll
index 0657ad78ff57c..0fde0dadc0d8d 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-empty-ctor-dtor.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-empty-ctor-dtor.ll
@@ -1,8 +1,8 @@
 ; RUN: opt -S -mtriple=amdgcn--  -amdgpu-lower-ctor-dtor < %s | FileCheck %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf -s - 2>&1 | FileCheck %s
 
-@llvm.global_ctors = appending global [0 x { i32, void ()*, i8* }] zeroinitializer
-@llvm.global_dtors = appending global [0 x { i32, void ()*, i8* }] zeroinitializer
+@llvm.global_ctors = appending global [0 x { i32, ptr, ptr }] zeroinitializer
+@llvm.global_dtors = appending global [0 x { i32, ptr, ptr }] zeroinitializer
 
 ; No amdgpu_kernels emitted for empty global_ctors
 ; CHECK-NOT: amdgcn.device.init

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
index 13b6d3c7fb471..5f4c62f3b233b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
@@ -24,27 +24,25 @@
 ;.
 define amdgpu_kernel void @k0(i64 %x) {
 ; CHECK-LABEL: @k0(
-; CHECK-NEXT:    %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), i32 0, i32 0
-; CHECK-NEXT:    %2 = addrspacecast i8 addrspace(3)* %1 to i8*
-; CHECK-NEXT:    %ptr = getelementptr inbounds i8, i8* %2, i64 %x
-; CHECK-NEXT:    store i8 1, i8* %ptr, align 1
+; CHECK-NEXT:    %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds to ptr
+; CHECK-NEXT:    %ptr = getelementptr inbounds i8, ptr %1, i64 %x
+; CHECK-NEXT:    store i8 1, ptr %ptr, align 1
 ; CHECK-NEXT:    ret void
 ;
-  %ptr = getelementptr inbounds i8, i8* addrspacecast ([2 x i8] addrspace(3)* @lds.1 to i8*), i64 %x
-  store i8 1, i8 addrspace(0)* %ptr, align 1
+  %ptr = getelementptr inbounds i8, ptr addrspacecast (ptr addrspace(3) @lds.1 to ptr), i64 %x
+  store i8 1, ptr addrspace(0) %ptr, align 1
   ret void
 }
 
 define amdgpu_kernel void @k1(i64 %x) {
 ; CHECK-LABEL: @k1(
-; CHECK-NEXT:    %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
-; CHECK-NEXT:    %2 = addrspacecast i8 addrspace(3)* %1 to i8*
-; CHECK-NEXT:    %ptr = getelementptr inbounds i8, i8* %2, i64 %x
-; CHECK-NEXT:    store i8 1, i8* %ptr, align 1
+; CHECK-NEXT:    %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds to ptr
+; CHECK-NEXT:    %ptr = getelementptr inbounds i8, ptr %1, i64 %x
+; CHECK-NEXT:    store i8 1, ptr %ptr, align 1
 ; CHECK-NEXT:    ret void
 ;
-  %ptr = getelementptr inbounds i8, i8* addrspacecast ([2 x i8] addrspace(3)* @lds.1 to i8*), i64 %x
-  store i8 1, i8 addrspace(0)* %ptr, align 1
+  %ptr = getelementptr inbounds i8, ptr addrspacecast (ptr addrspace(3) @lds.1 to ptr), i64 %x
+  store i8 1, ptr addrspace(0) %ptr, align 1
   ret void
 }
 
@@ -53,16 +51,12 @@ define amdgpu_kernel void @k1(i64 %x) {
 ; Use constant twice from the same kernel
 define amdgpu_kernel void @k2(i64 %x) {
 ; CHECK-LABEL: @k2(
-; CHECK-NEXT:    %ptr1 = bitcast i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0) to i8 addrspace(3)*
-; CHECK-NEXT:    store i8 1, i8 addrspace(3)* %ptr1, align 4
-; CHECK-NEXT:    %ptr2 = bitcast i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0) to i8 addrspace(3)*
-; CHECK-NEXT:    store i8 2, i8 addrspace(3)* %ptr2, align 4
+; CHECK-NEXT:    store i8 1, ptr addrspace(3) @llvm.amdgcn.kernel.k2.lds, align 4
+; CHECK-NEXT:    store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k2.lds, align 4
 ; CHECK-NEXT:    ret void
 ;
-  %ptr1 = bitcast i32 addrspace(3)* @lds.2 to i8 addrspace(3)*
-  store i8 1, i8 addrspace(3)* %ptr1, align 4
-  %ptr2 = bitcast i32 addrspace(3)* @lds.2 to i8 addrspace(3)*
-  store i8 2, i8 addrspace(3)* %ptr2, align 4
+  store i8 1, ptr addrspace(3) @lds.2, align 4
+  store i8 2, ptr addrspace(3) @lds.2, align 4
   ret void
 }
 
@@ -71,34 +65,31 @@ define amdgpu_kernel void @k2(i64 %x) {
 ; Use constant twice from the same kernel but a different other constant.
 define amdgpu_kernel void @k3(i64 %x) {
 ; CHECK-LABEL: @k3(
-; CHECK-NEXT:    %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 16
-; CHECK-NEXT:    %2 = bitcast i8 addrspace(3)* %1 to i64 addrspace(3)*
-; CHECK-NEXT:    %ptr1 = addrspacecast i64 addrspace(3)* %2 to i64*
-; CHECK-NEXT:    store i64 1, i64* %ptr1, align 1
-; CHECK-NEXT:    %3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 24
-; CHECK-NEXT:    %4 = bitcast i8 addrspace(3)* %3 to i64 addrspace(3)*
-; CHECK-NEXT:    %ptr2 = addrspacecast i64 addrspace(3)* %4 to i64*
-; CHECK-NEXT:    store i64 2, i64* %ptr2, align 8
+; CHECK-NEXT:    %1 = getelementptr inbounds [32 x i8], ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds, i32 0, i32 16
+; CHECK-NEXT:    %ptr1 = addrspacecast ptr addrspace(3) %1 to ptr
+; CHECK-NEXT:    store i64 1, ptr %ptr1, align 1
+; CHECK-NEXT:    %2 = getelementptr inbounds [32 x i8], ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds, i32 0, i32 24
+; CHECK-NEXT:    %ptr2 = addrspacecast ptr addrspace(3) %2 to ptr
+; CHECK-NEXT:    store i64 2, ptr %ptr2, align 8
 ; CHECK-NEXT:    ret void
 ;
-  %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64*
-  store i64 1, i64* %ptr1, align 1
-  %ptr2 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 24) to i64 addrspace(3)*) to i64*
-  store i64 2, i64* %ptr2, align 1
+  %ptr1 = addrspacecast ptr addrspace(3) getelementptr inbounds ([32 x i8], ptr addrspace(3) @lds.3, i32 0, i32 16) to ptr
+  store i64 1, ptr %ptr1, align 1
+  %ptr2 = addrspacecast ptr addrspace(3) getelementptr inbounds ([32 x i8], ptr addrspace(3) @lds.3, i32 0, i32 24) to ptr
+  store i64 2, ptr %ptr2, align 1
   ret void
 }
 
 ; @lds.1 is used from constant expressions in different kernels.
 define amdgpu_kernel void @k4(i64 %x) {
 ; CHECK-LABEL: @k4(
-; CHECK-NEXT:    %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k4.lds.t, %llvm.amdgcn.kernel.k4.lds.t addrspace(3)* @llvm.amdgcn.kernel.k4.lds, i32 0, i32 0), i32 0, i32 0
-; CHECK-NEXT:    %2 = addrspacecast i8 addrspace(3)* %1 to i8*
-; CHECK-NEXT:    %ptr = getelementptr inbounds i8, i8* %2, i64 %x
-; CHECK-NEXT:    store i8 1, i8* %ptr, align 1
+; CHECK-NEXT:    %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k4.lds to ptr
+; CHECK-NEXT:    %ptr = getelementptr inbounds i8, ptr %1, i64 %x
+; CHECK-NEXT:    store i8 1, ptr %ptr, align 1
 ; CHECK-NEXT:    ret void
 ;
-  %ptr = getelementptr inbounds i8, i8* addrspacecast ([2 x i8] addrspace(3)* @lds.1 to i8*), i64 %x
-  store i8 1, i8 addrspace(0)* %ptr, align 1
+  %ptr = getelementptr inbounds i8, ptr addrspacecast (ptr addrspace(3) @lds.1 to ptr), i64 %x
+  store i8 1, ptr addrspace(0) %ptr, align 1
   ret void
 }
 
@@ -107,13 +98,11 @@ define amdgpu_kernel void @k4(i64 %x) {
 ; Multiple constexpr use in a same instruction.
 define amdgpu_kernel void @k5() {
 ; CHECK-LABEL: @k5(
-; CHECK-NEXT:  %1 = addrspacecast [505 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k5.lds.t, %llvm.amdgcn.kernel.k5.lds.t addrspace(3)* @llvm.amdgcn.kernel.k5.lds, i32 0, i32 0) to [505 x i32]*
-; CHECK-NEXT:  %2 = getelementptr inbounds [505 x i32], [505 x i32]* %1, i64 0, i64 0
-; CHECK-NEXT:  %3 = addrspacecast [505 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k5.lds.t, %llvm.amdgcn.kernel.k5.lds.t addrspace(3)* @llvm.amdgcn.kernel.k5.lds, i32 0, i32 0) to [505 x i32]*
-; CHECK-NEXT:  %4 = getelementptr inbounds [505 x i32], [505 x i32]* %3, i64 0, i64 0
-; CHECK-NEXT:  call void undef(i32* %2, i32* %4)
+; CHECK-NEXT:  %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k5.lds to ptr
+; CHECK-NEXT:  %2 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.k5.lds to ptr
+; CHECK-NEXT:  call void undef(ptr %1, ptr %2)
 ;
-  call void undef(i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0), i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0))
+  call void undef(ptr addrspacecast (ptr addrspace(3) @lds.4 to ptr), ptr addrspacecast (ptr addrspace(3) @lds.4 to ptr))
   ret void
 }
 
@@ -125,12 +114,12 @@ define amdgpu_kernel void @k5() {
 define amdgpu_kernel void @k6() {
 ; CHECK-LABEL: @k6(
 
-; CHECK-NEXT:  %1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k6.lds.t, %llvm.amdgcn.kernel.k6.lds.t addrspace(3)* @llvm.amdgcn.kernel.k6.lds, i32 0, i32 0), i32 0, i32 2
-; CHECK-NEXT:  %2 = ptrtoint i32 addrspace(3)* %1 to i32
-; CHECK-NEXT:  %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k6.lds.t, %llvm.amdgcn.kernel.k6.lds.t addrspace(3)* @llvm.amdgcn.kernel.k6.lds, i32 0, i32 0), i32 0, i32 2
-; CHECK-NEXT:  store i32 %2, i32 addrspace(3)* %3, align 8
+; CHECK-NEXT:  %1 = getelementptr inbounds [4 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.k6.lds, i32 0, i32 2
+; CHECK-NEXT:  %2 = ptrtoint ptr addrspace(3) %1 to i32
+; CHECK-NEXT:  %3 = getelementptr inbounds [4 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.k6.lds, i32 0, i32 2
+; CHECK-NEXT:  store i32 %2, ptr addrspace(3) %3, align 8
 ; CHECK-NEXT:  ret void
 ;
-  store i32 ptrtoint (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2) to i32), i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2)
+  store i32 ptrtoint (ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @lds.5, i32 0, i32 2) to i32), ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @lds.5, i32 0, i32 2)
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-check-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-check-metadata.ll
index e6ca5bb9472f2..d96805656f72c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-check-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-check-metadata.ll
@@ -10,7 +10,7 @@ target triple = "amdgcn-amd-amdhsa"
 
 define i32 @rw() #0 {
 entry:
-  %0 = atomicrmw add i32 addrspace(3)* @global_barrier_state, i32 1 acq_rel, align 4
+  %0 = atomicrmw add ptr addrspace(3) @global_barrier_state, i32 1 acq_rel, align 4
   ret i32 %0
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
index 303cc82e69c2d..d55d39107c4d8 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
@@ -32,36 +32,36 @@
 @addr4 = addrspace(4) global i64 undef
 
 ; Assign to self is treated as any other initializer, i.e. ignored by this pass
-; CHECK: @toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8
-@toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8
+; CHECK: @toself = addrspace(3) global ptr addrspace(3) @toself, align 8
+@toself = addrspace(3) global ptr addrspace(3) @toself, align 8
 
 ; Use by .used lists doesn't trigger lowering
 ; CHECK-NOT: @llvm.used =
-@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @var1 to ptr)], section "llvm.metadata"
 
-; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
-@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @var2 to ptr)], section "llvm.metadata"
+@llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @var2 to ptr)], section "llvm.metadata"
 
 ; Access from a function would cause lowering for non-excluded cases
 ; CHECK-LABEL: @use_variables()
-; CHECK: %c0 = load i32, i32 addrspace(3)* @const_undef, align 4
-; CHECK: %c1 = load i64, i64 addrspace(3)* @const_with_init, align 4
-; CHECK: %v0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 seq_cst
-; CHECK: %v1 = cmpxchg i32 addrspace(3)* @extern, i32 4, i32 %c0 acq_rel monotonic
-; CHECK: %v2 = atomicrmw add i64 addrspace(4)* @addr4, i64 %c1 monotonic
+; CHECK: %c0 = load i32, ptr addrspace(3) @const_undef, align 4
+; CHECK: %c1 = load i64, ptr addrspace(3) @const_with_init, align 4
+; CHECK: %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst
+; CHECK: %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic
+; CHECK: %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
 define void @use_variables() {
-  %c0 = load i32, i32 addrspace(3)* @const_undef, align 4
-  %c1 = load i64, i64 addrspace(3)* @const_with_init, align 4
-  %v0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 seq_cst
-  %v1 = cmpxchg i32 addrspace(3)* @extern, i32 4, i32 %c0 acq_rel monotonic
-  %v2 = atomicrmw add i64 addrspace(4)* @addr4, i64 %c1 monotonic
+  %c0 = load i32, ptr addrspace(3) @const_undef, align 4
+  %c1 = load i64, ptr addrspace(3) @const_with_init, align 4
+  %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst
+  %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic
+  %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
   ret void
 }
 
 ; CHECK-LABEL: @kern_use()
-; CHECK: %inc = atomicrmw add i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.kern_use.lds.t, %llvm.amdgcn.kernel.kern_use.lds.t addrspace(3)* @llvm.amdgcn.kernel.kern_use.lds, i32 0, i32 0), i32 1 monotonic, align 4
+; CHECK: %inc = atomicrmw add ptr addrspace(3) @llvm.amdgcn.kernel.kern_use.lds, i32 1 monotonic, align 4
 define amdgpu_kernel void @kern_use() {
-  %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
+  %inc = atomicrmw add ptr addrspace(3) @var1, i32 1 monotonic
   call void @use_variables()
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
index 1ddc365bc9741..e08bef6ac8e0f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
@@ -17,18 +17,14 @@
 ; GCN:     ds_write_b8 [[NULL]], [[TWO]] offset:16
 define amdgpu_kernel void @k0() {
 ; OPT-LABEL: @k0(
-; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]
-; OPT-NEXT:    [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
-; OPT-NEXT:    store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
-; OPT-NEXT:    [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
-; OPT-NEXT:    store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
+; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+; OPT-NEXT:    store i8 1, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1
+; OPT-NEXT:    store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 16
 ; OPT-NEXT:    call void @f0()
 ; OPT-NEXT:    ret void
 ;
-  %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
-  store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
-  %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
-  store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
+  store i8 1, ptr addrspace(3) @lds.size.1.align.1, align 1
+  store i8 2, ptr addrspace(3) @lds.size.16.align.16, align 16
   call void @f0()
   ret void
 }
@@ -39,12 +35,10 @@ define amdgpu_kernel void @k0() {
 ; GCN:     ds_write_b8 [[NULL]], [[TREE]]
 define void @f0() {
 ; OPT-LABEL: @f0() {
-; OPT-NEXT:    [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
-; OPT-NEXT:    store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
+; OPT-NEXT:    store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1
 ; OPT-NEXT:    ret void
 ;
-  %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
-  store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+  store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll
index 7a33754892e4e..66d357f724c3c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-ambiguous.ll
@@ -9,27 +9,27 @@
 @kernel.lds = addrspace(3) global i8 undef
 define amdgpu_kernel void @k0() {
 ; CHECK-LABEL: @k0(
-; CHECK-NEXT:    [[LD:%.*]] = load i8, i8 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), align 1
+; CHECK-NEXT:    [[LD:%.*]] = load i8, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 1
 ; CHECK-NEXT:    [[MUL:%.*]] = mul i8 [[LD]], 2
-; CHECK-NEXT:    store i8 [[MUL]], i8 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), align 1
+; CHECK-NEXT:    store i8 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 1
 ; CHECK-NEXT:    ret void
 ;
-  %ld = load i8, i8 addrspace(3)* @kernel.lds
+  %ld = load i8, ptr addrspace(3) @kernel.lds
   %mul = mul i8 %ld, 2
-  store i8 %mul, i8  addrspace(3)* @kernel.lds
+  store i8 %mul, ptr  addrspace(3) @kernel.lds
   ret void
 }
 
 define amdgpu_kernel void @k1() {
 ; CHECK-LABEL: @k1(
-; CHECK-NEXT:    [[LD:%.*]] = load i8, i8 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K1_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K1_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), align 1
+; CHECK-NEXT:    [[LD:%.*]] = load i8, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 1
 ; CHECK-NEXT:    [[MUL:%.*]] = mul i8 [[LD]], 3
-; CHECK-NEXT:    store i8 [[MUL]], i8 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K1_LDS_T]], [[LLVM_AMDGCN_KERNEL_K1_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), align 1
+; CHECK-NEXT:    store i8 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 1
 ; CHECK-NEXT:    ret void
 ;
-  %ld = load i8, i8 addrspace(3)* @kernel.lds
+  %ld = load i8, ptr addrspace(3) @kernel.lds
   %mul = mul i8 %ld, 3
-  store i8 %mul, i8  addrspace(3)* @kernel.lds
+  store i8 %mul, ptr  addrspace(3) @kernel.lds
   ret void
 }
 
@@ -41,39 +41,39 @@ define amdgpu_kernel void @k1() {
 @function.lds = addrspace(3) global i16 undef
 define void @f0() {
 ; M_OR_HY-LABEL: @f0(
-; M_OR_HY-NEXT:    [[LD:%.*]] = load i16, i16 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2
+; M_OR_HY-NEXT:    [[LD:%.*]] = load i16, ptr addrspace(3) @llvm.amdgcn.module.lds, align 2
 ; M_OR_HY-NEXT:    [[MUL:%.*]] = mul i16 [[LD]], 4
-; M_OR_HY-NEXT:    store i16 [[MUL]], i16 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2
+; M_OR_HY-NEXT:    store i16 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 2
 ; M_OR_HY-NEXT:    ret void
 ;
 ; TABLE-LABEL: @f0(
 ; TABLE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
-; TABLE-NEXT:    [[FUNCTION_LDS2:%.*]] = getelementptr inbounds [2 x [1 x i32]], [2 x [1 x i32]] addrspace(4)* @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
-; TABLE-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[FUNCTION_LDS2]], align 4
-; TABLE-NEXT:    [[FUNCTION_LDS3:%.*]] = inttoptr i32 [[TMP2]] to i16 addrspace(3)*
-; TABLE-NEXT:    [[LD:%.*]] = load i16, i16 addrspace(3)* [[FUNCTION_LDS3]], align 2
+; TABLE-NEXT:    [[FUNCTION_LDS2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; TABLE-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[FUNCTION_LDS2]], align 4
+; TABLE-NEXT:    [[FUNCTION_LDS3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; TABLE-NEXT:    [[LD:%.*]] = load i16, ptr addrspace(3) [[FUNCTION_LDS3]], align 2
 ; TABLE-NEXT:    [[MUL:%.*]] = mul i16 [[LD]], 4
-; TABLE-NEXT:    [[FUNCTION_LDS:%.*]] = getelementptr inbounds [2 x [1 x i32]], [2 x [1 x i32]] addrspace(4)* @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
-; TABLE-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[FUNCTION_LDS]], align 4
-; TABLE-NEXT:    [[FUNCTION_LDS1:%.*]] = inttoptr i32 [[TMP3]] to i16 addrspace(3)*
-; TABLE-NEXT:    store i16 [[MUL]], i16 addrspace(3)* [[FUNCTION_LDS1]], align 2
+; TABLE-NEXT:    [[FUNCTION_LDS:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; TABLE-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[FUNCTION_LDS]], align 4
+; TABLE-NEXT:    [[FUNCTION_LDS1:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; TABLE-NEXT:    store i16 [[MUL]], ptr addrspace(3) [[FUNCTION_LDS1]], align 2
 ; TABLE-NEXT:    ret void
 ;
-  %ld = load i16, i16 addrspace(3)* @function.lds
+  %ld = load i16, ptr addrspace(3) @function.lds
   %mul = mul i16 %ld, 4
-  store i16 %mul, i16  addrspace(3)* @function.lds
+  store i16 %mul, ptr  addrspace(3) @function.lds
   ret void
 }
 
 
 define amdgpu_kernel void @k0_f0() {
 ; M_OR_HY-LABEL: @k0_f0(
-; M_OR_HY-NEXT:    call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]
+; M_OR_HY-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
 ; M_OR_HY-NEXT:    call void @f0()
 ; M_OR_HY-NEXT:    ret void
 ;
 ; TABLE-LABEL: @k0_f0(
-; TABLE-NEXT:    call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_KERNEL_K0_F0_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.kernel.k0_f0.lds) ]
+; TABLE-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0_f0.lds) ]
 ; TABLE-NEXT:    call void @f0()
 ; TABLE-NEXT:    ret void
 ;
@@ -83,12 +83,12 @@ define amdgpu_kernel void @k0_f0() {
 
 define amdgpu_kernel void @k1_f0() {
 ; M_OR_HY-LABEL: @k1_f0(
-; M_OR_HY-NEXT:    call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]
+; M_OR_HY-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
 ; M_OR_HY-NEXT:    call void @f0()
 ; M_OR_HY-NEXT:    ret void
 ;
 ; TABLE-LABEL: @k1_f0(
-; TABLE-NEXT:    call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_KERNEL_K1_F0_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.kernel.k1_f0.lds) ]
+; TABLE-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1_f0.lds) ]
 ; TABLE-NEXT:    call void @f0()
 ; TABLE-NEXT:    ret void
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll
index 0fe89a3ddb5a4..c380bb1c9b926 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll
@@ -12,14 +12,14 @@
 @k0.lds = addrspace(3) global i8 undef
 define amdgpu_kernel void @k0() {
 ; CHECK-LABEL: @k0(
-; CHECK-NEXT:    [[LD:%.*]] = load i8, i8 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), align 1
+; CHECK-NEXT:    [[LD:%.*]] = load i8, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 1
 ; CHECK-NEXT:    [[MUL:%.*]] = mul i8 [[LD]], 2
-; CHECK-NEXT:    store i8 [[MUL]], i8 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), align 1
+; CHECK-NEXT:    store i8 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 1
 ; CHECK-NEXT:    ret void
 ;
-  %ld = load i8, i8 addrspace(3)* @k0.lds
+  %ld = load i8, ptr addrspace(3) @k0.lds
   %mul = mul i8 %ld, 2
-  store i8 %mul, i8  addrspace(3)* @k0.lds
+  store i8 %mul, ptr  addrspace(3) @k0.lds
   ret void
 }
 
@@ -28,44 +28,44 @@ define amdgpu_kernel void @k0() {
 @f0.lds = addrspace(3) global i16 undef
 define void @f0() {
 ; MODULE-LABEL: @f0(
-; MODULE-NEXT:    [[LD:%.*]] = load i16, i16 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope !0, !noalias !3
+; MODULE-NEXT:    [[LD:%.*]] = load i16, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope !0, !noalias !3
 ; MODULE-NEXT:    [[MUL:%.*]] = mul i16 [[LD]], 3
-; MODULE-NEXT:    store i16 [[MUL]], i16 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope !0, !noalias !3
+; MODULE-NEXT:    store i16 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !alias.scope !0, !noalias !3
 ; MODULE-NEXT:    ret void
 ;
 ; TABLE-LABEL: @f0(
 ; TABLE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
-; TABLE-NEXT:    [[F0_LDS2:%.*]] = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(4)* @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
-; TABLE-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[F0_LDS2]], align 4
-; TABLE-NEXT:    [[F0_LDS3:%.*]] = inttoptr i32 [[TMP2]] to i16 addrspace(3)*
-; TABLE-NEXT:    [[LD:%.*]] = load i16, i16 addrspace(3)* [[F0_LDS3]], align 2
+; TABLE-NEXT:    [[F0_LDS2:%.*]] = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; TABLE-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[F0_LDS2]], align 4
+; TABLE-NEXT:    [[F0_LDS3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; TABLE-NEXT:    [[LD:%.*]] = load i16, ptr addrspace(3) [[F0_LDS3]], align 2
 ; TABLE-NEXT:    [[MUL:%.*]] = mul i16 [[LD]], 3
-; TABLE-NEXT:    [[F0_LDS:%.*]] = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(4)* @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
-; TABLE-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[F0_LDS]], align 4
-; TABLE-NEXT:    [[F0_LDS1:%.*]] = inttoptr i32 [[TMP3]] to i16 addrspace(3)*
-; TABLE-NEXT:    store i16 [[MUL]], i16 addrspace(3)* [[F0_LDS1]], align 2
+; TABLE-NEXT:    [[F0_LDS:%.*]] = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; TABLE-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[F0_LDS]], align 4
+; TABLE-NEXT:    [[F0_LDS1:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; TABLE-NEXT:    store i16 [[MUL]], ptr addrspace(3) [[F0_LDS1]], align 2
 ; TABLE-NEXT:    ret void
 ;
 ; K_OR_HY-LABEL: @f0(
-; K_OR_HY-NEXT:    [[LD:%.*]] = load i16, i16 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K_F0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K_F0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k_f0.lds, i32 0, i32 0), align 2
+; K_OR_HY-NEXT:    [[LD:%.*]] = load i16, ptr addrspace(3) @llvm.amdgcn.kernel.k_f0.lds, align 2
 ; K_OR_HY-NEXT:    [[MUL:%.*]] = mul i16 [[LD]], 3
-; K_OR_HY-NEXT:    store i16 [[MUL]], i16 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K_F0_LDS_T]], [[LLVM_AMDGCN_KERNEL_K_F0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k_f0.lds, i32 0, i32 0), align 2
+; K_OR_HY-NEXT:    store i16 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k_f0.lds, align 2
 ; K_OR_HY-NEXT:    ret void
 ;
-  %ld = load i16, i16 addrspace(3)* @f0.lds
+  %ld = load i16, ptr addrspace(3) @f0.lds
   %mul = mul i16 %ld, 3
-  store i16 %mul, i16  addrspace(3)* @f0.lds
+  store i16 %mul, ptr  addrspace(3) @f0.lds
   ret void
 }
 
 define amdgpu_kernel void @k_f0() {
 ; MODULE-LABEL: @k_f0(
-; MODULE-NEXT:    call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]
+; MODULE-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
 ; MODULE-NEXT:    call void @f0()
 ; MODULE-NEXT:    ret void
 ;
 ; TABLE-LABEL: @k_f0(
-; TABLE-NEXT:    call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_KERNEL_K_F0_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.kernel.k_f0.lds) ]
+; TABLE-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k_f0.lds) ]
 ; TABLE-NEXT:    call void @f0()
 ; TABLE-NEXT:    ret void
 ;
@@ -82,63 +82,63 @@ define amdgpu_kernel void @k_f0() {
 @both.lds = addrspace(3) global i32 undef
 define void @f_both() {
 ; MODULE-LABEL: @f_both(
-; MODULE-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 4, !alias.scope !4, !noalias !3
+; MODULE-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !4, !noalias !3
 ; MODULE-NEXT:    [[MUL:%.*]] = mul i32 [[LD]], 4
-; MODULE-NEXT:    store i32 [[MUL]], i32 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 4, !alias.scope !4, !noalias !3
+; MODULE-NEXT:    store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !4, !noalias !3
 ; MODULE-NEXT:    ret void
 ;
 ; TABLE-LABEL: @f_both(
 ; TABLE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
-; TABLE-NEXT:    [[BOTH_LDS2:%.*]] = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(4)* @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
-; TABLE-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[BOTH_LDS2]], align 4
-; TABLE-NEXT:    [[BOTH_LDS3:%.*]] = inttoptr i32 [[TMP2]] to i32 addrspace(3)*
-; TABLE-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(3)* [[BOTH_LDS3]], align 4
+; TABLE-NEXT:    [[BOTH_LDS2:%.*]] = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; TABLE-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[BOTH_LDS2]], align 4
+; TABLE-NEXT:    [[BOTH_LDS3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; TABLE-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(3) [[BOTH_LDS3]], align 4
 ; TABLE-NEXT:    [[MUL:%.*]] = mul i32 [[LD]], 4
-; TABLE-NEXT:    [[BOTH_LDS:%.*]] = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(4)* @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
-; TABLE-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[BOTH_LDS]], align 4
-; TABLE-NEXT:    [[BOTH_LDS1:%.*]] = inttoptr i32 [[TMP3]] to i32 addrspace(3)*
-; TABLE-NEXT:    store i32 [[MUL]], i32 addrspace(3)* [[BOTH_LDS1]], align 4
+; TABLE-NEXT:    [[BOTH_LDS:%.*]] = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; TABLE-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[BOTH_LDS]], align 4
+; TABLE-NEXT:    [[BOTH_LDS1:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; TABLE-NEXT:    store i32 [[MUL]], ptr addrspace(3) [[BOTH_LDS1]], align 4
 ; TABLE-NEXT:    ret void
 ;
 ; K_OR_HY-LABEL: @f_both(
-; K_OR_HY-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0_both.lds, i32 0, i32 0), align 4
+; K_OR_HY-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4
 ; K_OR_HY-NEXT:    [[MUL:%.*]] = mul i32 [[LD]], 4
-; K_OR_HY-NEXT:    store i32 [[MUL]], i32 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T]], [[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0_both.lds, i32 0, i32 0), align 4
+; K_OR_HY-NEXT:    store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4
 ; K_OR_HY-NEXT:    ret void
 ;
-  %ld = load i32, i32 addrspace(3)* @both.lds
+  %ld = load i32, ptr addrspace(3) @both.lds
   %mul = mul i32 %ld, 4
-  store i32 %mul, i32  addrspace(3)* @both.lds
+  store i32 %mul, ptr  addrspace(3) @both.lds
   ret void
 }
 
 define amdgpu_kernel void @k0_both() {
 ; MODULE-LABEL: @k0_both(
-; MODULE-NEXT:    call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]
-; MODULE-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 4, !alias.scope !4, !noalias !0
+; MODULE-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+; MODULE-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !4, !noalias !0
 ; MODULE-NEXT:    [[MUL:%.*]] = mul i32 [[LD]], 5
-; MODULE-NEXT:    store i32 [[MUL]], i32 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 4, !alias.scope !4, !noalias !0
+; MODULE-NEXT:    store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope !4, !noalias !0
 ; MODULE-NEXT:    call void @f_both()
 ; MODULE-NEXT:    ret void
 ;
 ; TABLE-LABEL: @k0_both(
-; TABLE-NEXT:    call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.kernel.k0_both.lds) ]
-; TABLE-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T]], [[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0_both.lds, i32 0, i32 0), align 4
+; TABLE-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds) ]
+; TABLE-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4
 ; TABLE-NEXT:    [[MUL:%.*]] = mul i32 [[LD]], 5
-; TABLE-NEXT:    store i32 [[MUL]], i32 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T]], [[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0_both.lds, i32 0, i32 0), align 4
+; TABLE-NEXT:    store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4
 ; TABLE-NEXT:    call void @f_both()
 ; TABLE-NEXT:    ret void
 ;
 ; K_OR_HY-LABEL: @k0_both(
-; K_OR_HY-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0_both.lds, i32 0, i32 0), align 4
+; K_OR_HY-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4
 ; K_OR_HY-NEXT:    [[MUL:%.*]] = mul i32 [[LD]], 5
-; K_OR_HY-NEXT:    store i32 [[MUL]], i32 addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T]], [[LLVM_AMDGCN_KERNEL_K0_BOTH_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0_both.lds, i32 0, i32 0), align 4
+; K_OR_HY-NEXT:    store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k0_both.lds, align 4
 ; K_OR_HY-NEXT:    call void @f_both()
 ; K_OR_HY-NEXT:    ret void
 ;
-  %ld = load i32, i32 addrspace(3)* @both.lds
+  %ld = load i32, ptr addrspace(3) @both.lds
   %mul = mul i32 %ld, 5
-  store i32 %mul, i32  addrspace(3)* @both.lds
+  store i32 %mul, ptr  addrspace(3) @both.lds
   call void @f_both()
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
index 972a7a286c0b3..b8e49d5c80455 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -10,7 +10,7 @@
 ; Start with one value to replace and one to ignore in the .use list
 
 ; @ignored still in list, @tolower removed
-; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+; CHECK: @llvm.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @ignored to ptr)], section "llvm.metadata"
 
 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
 
@@ -22,12 +22,12 @@
 @ignored = addrspace(1) global i64 0
 
 
-@llvm.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+@llvm.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(3) @tolower to ptr), ptr addrspacecast (ptr addrspace(1) @ignored to ptr)], section "llvm.metadata"
 
 ; @ignored still in list, @tolower removed, llvm.amdgcn.module.lds appended
-; CHECK: @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(1) @ignored to ptr), ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
 
-@llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+@llvm.compiler.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(3) @tolower to ptr), ptr addrspacecast (ptr addrspace(1) @ignored to ptr)], section "llvm.metadata"
 
 
 ; Functions that are not called are ignored by the lowering
@@ -37,9 +37,9 @@ define amdgpu_kernel void @call_func() {
 }
 
 ; CHECK-LABEL: @func()
-; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.000000e+00 monotonic, align 8
+; CHECK: %dec = atomicrmw fsub ptr addrspace(3) @llvm.amdgcn.module.lds, float 1.000000e+00 monotonic, align 8
 define void @func() {
-  %dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
-  %unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
+  %dec = atomicrmw fsub ptr addrspace(3) @tolower, float 1.0 monotonic
+  %unused0 = atomicrmw add ptr addrspace(1) @ignored, i64 1 monotonic
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
index 113c7b0f08c4e..ae52854d2f851 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
@@ -1,8 +1,8 @@
 ; RUN: opt -S -mtriple=amdgcn--  -amdgpu-lower-ctor-dtor < %s | FileCheck %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf -s - 2>&1 | FileCheck %s -check-prefix=CHECK-VIS
 
-@llvm.global_ctors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @foo.5, i8* null }]
-@llvm.global_dtors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @bar.5, i8* null }]
+@llvm.global_ctors = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }, { i32, ptr, ptr } { i32 1, ptr @foo.5, ptr null }]
+@llvm.global_dtors = appending addrspace(1) global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }, { i32, ptr, ptr } { i32 1, ptr @bar.5, ptr null }]
 
 ; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.init() #0
 ; CHECK-NEXT: call void @foo

diff  --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index 42f723d62c5a2..b89aea8edbae5 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -15,24 +15,24 @@
 ; GCN: {{flat|global}}_store_{{short|b16}} v{{.+}}, v[[R]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @mad_u16(
-    i16 addrspace(1)* %r,
-    i16 addrspace(1)* %a,
-    i16 addrspace(1)* %b,
-    i16 addrspace(1)* %c) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b,
+    ptr addrspace(1) %c) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a, i32 %tid
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b, i32 %tid
-  %c.gep = getelementptr inbounds i16, i16 addrspace(1)* %c, i32 %tid
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a, i32 %tid
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b, i32 %tid
+  %c.gep = getelementptr inbounds i16, ptr addrspace(1) %c, i32 %tid
 
-  %a.val = load volatile i16, i16 addrspace(1)* %a.gep
-  %b.val = load volatile i16, i16 addrspace(1)* %b.gep
-  %c.val = load volatile i16, i16 addrspace(1)* %c.gep
+  %a.val = load volatile i16, ptr addrspace(1) %a.gep
+  %b.val = load volatile i16, ptr addrspace(1) %b.gep
+  %c.val = load volatile i16, ptr addrspace(1) %c.gep
 
   %m.val = mul i16 %a.val, %b.val
   %r.val = add i16 %m.val, %c.val
 
-  store i16 %r.val, i16 addrspace(1)* %r
+  store i16 %r.val, ptr addrspace(1) %r
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
index fefdbe8a7e1a0..620566d3baff3 100644
--- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -5,17 +5,16 @@
 
 declare i32 @llvm.amdgcn.workgroup.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.x() #0
-declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
 
 ; GCN-LABEL: {{^}}get_global_id_0:
 ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
 ; GCN: s_mul_i32 [[MUL:s[0-9]+]], s8, [[WGSIZEX]]
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0
-define amdgpu_kernel void @get_global_id_0(i32 addrspace(1)* %out) #1 {
-  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %cast.dispatch.ptr = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)*
-  %gep = getelementptr inbounds i32, i32 addrspace(4)* %cast.dispatch.ptr, i64 1
-  %workgroup.size.xy = load i32, i32 addrspace(4)* %gep, align 4, !invariant.load !0
+define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 {
+  %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep = getelementptr inbounds i32, ptr addrspace(4) %dispatch.ptr, i64 1
+  %workgroup.size.xy = load i32, ptr addrspace(4) %gep, align 4, !invariant.load !0
   %workgroup.size.x = and i32 %workgroup.size.xy, 65535
 
   %workitem.id.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
@@ -24,7 +23,7 @@ define amdgpu_kernel void @get_global_id_0(i32 addrspace(1)* %out) #1 {
   %mul = mul i32 %workgroup.id.x, %workgroup.size.x
   %add = add i32 %mul, %workitem.id.x
 
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index f806149d0c395..26a9043a1b779 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -631,7 +631,7 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
   ret i64 %mad
 }
 
-define amdgpu_kernel void @mad_i64_i32_uniform(i64 addrspace(1)* %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 {
+define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: mad_i64_i32_uniform:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -700,7 +700,7 @@ define amdgpu_kernel void @mad_i64_i32_uniform(i64 addrspace(1)* %out, i32 %arg0
   %ext1 = zext i32 %arg1 to i64
   %mul = mul i64 %ext0, %ext1
   %mad = add i64 %mul, %arg2
-  store i64 %mad, i64 addrspace(1)* %out
+  store i64 %mad, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mad_int24.ll b/llvm/test/CodeGen/AMDGPU/mad_int24.ll
index e0fd0fe58150c..0e51e38a12f2a 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_int24.ll
@@ -11,7 +11,7 @@
 ; GCN: s_bfe_i32
 ; GCN: s_mul_i32
 ; GCN: s_add_i32
-define amdgpu_kernel void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @i32_mad24(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = shl i32 %a, 8
   %a_24 = ashr i32 %0, 8
@@ -19,7 +19,7 @@ entry:
   %b_24 = ashr i32 %1, 8
   %2 = mul i32 %a_24, %b_24
   %3 = add i32 %2, %c
-  store i32 %3, i32 addrspace(1)* %out
+  store i32 %3, ptr addrspace(1) %out
   ret void
 }
 
@@ -83,7 +83,7 @@ define i32 @mad24_intrin_known_bits_destroyed(i32 %a, i32 %b, i32 %c) {
 ; GCN-NOT: v_bfe
 ; GCN: v_mad_i32_i24
 ; GCN-NOT: v_bfe
-define void @mad24_destroyed_knownbits_2(i32 %arg, i32 %arg1, i32 %arg2, i32 addrspace(1)* %arg3) {
+define void @mad24_destroyed_knownbits_2(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(1) %arg3) {
 bb:
   br label %bb6
 
@@ -112,7 +112,7 @@ bb6:                                              ; preds = %bb6, %bb
   %tmp24 = ashr exact i32 %tmp23, 8
   %tmp25 = mul nsw i32 %tmp24, %tmp20
   %tmp26 = add nsw i32 %tmp25, %tmp22
-  store i32 %tmp26, i32 addrspace(1)* %arg3
+  store i32 %tmp26, ptr addrspace(1) %arg3
   %tmp27 = add nuw i32 %tmp, 1
   %tmp28 = icmp eq i32 %tmp27, %arg1
   br i1 %tmp28, label %bb5, label %bb6

diff  --git a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
index d7ef6a1fb634d..b1d53343a9557 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
@@ -326,7 +326,7 @@ define amdgpu_ps float @mad_i32_vvv_multiuse(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, %b
   %add = add i32 %mul, %c
-  store i32 %mul, i32* undef
+  store i32 %mul, ptr undef
   %cast = bitcast i32 %add to float
   ret float %cast
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
index db8904ef71e82..b9143150280a8 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; VI: s_mul_{{[iu]}}32
 ; VI: s_add_{{[iu]}}32
 
-define amdgpu_kernel void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @u32_mad24(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = shl i32 %a, 8
   %a_24 = lshr i32 %0, 8
@@ -21,7 +21,7 @@ entry:
   %b_24 = lshr i32 %1, 8
   %2 = mul i32 %a_24, %b_24
   %3 = add i32 %2, %c
-  store i32 %3, i32 addrspace(1)* %out
+  store i32 %3, ptr addrspace(1) %out
   ret void
 }
 
@@ -36,12 +36,12 @@ entry:
 ; GCN:	s_add_i32 [[MAD:s[0-9]]], [[MUL]], s{{[0-9]}}
 ; GCN:	s_sext_i32_i16 [[EXT:s[0-9]]], [[MAD]]
 ; GCN:	v_mov_b32_e32 v0, [[EXT]]
-define amdgpu_kernel void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
+define amdgpu_kernel void @i16_mad24(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) {
 entry:
   %0 = mul i16 %a, %b
   %1 = add i16 %0, %c
   %2 = sext i16 %1 to i32
-  store i32 %2, i32 addrspace(1)* %out
+  store i32 %2, ptr addrspace(1) %out
   ret void
 }
 
@@ -56,12 +56,12 @@ entry:
 ; GCN:	s_add_i32 [[MAD:s[0-9]]], [[MUL]], s{{[0-9]}}
 ; GCN:	s_sext_i32_i8 [[EXT:s[0-9]]], [[MAD]]
 ; GCN:	v_mov_b32_e32 v0, [[EXT]]
-define amdgpu_kernel void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
+define amdgpu_kernel void @i8_mad24(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) {
 entry:
   %0 = mul i8 %a, %b
   %1 = add i8 %0, %c
   %2 = sext i8 %1 to i32
-  store i32 %2, i32 addrspace(1)* %out
+  store i32 %2, ptr addrspace(1) %out
   ret void
 }
 
@@ -76,14 +76,14 @@ entry:
 ; EG: CNDE_INT
 ; SI: s_cselect
 ; GCN2: s_cselect
-define amdgpu_kernel void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @i24_i32_i32_mad(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
 entry:
   %0 = ashr i32 %a, 8
   %1 = icmp ne i32 %c, 0
   %2 = select i1 %1, i32 %0, i32 34
   %3 = mul i32 %2, %c
   %4 = add i32 %3, %d
-  store i32 %4, i32 addrspace(1)* %out
+  store i32 %4, ptr addrspace(1) %out
   ret void
 }
 
@@ -93,7 +93,7 @@ entry:
 ; SI: s_mul_i32
 ; SI: s_add_i32
 ; SI: s_add_i32
-define amdgpu_kernel void @extra_and(i32 addrspace(1)* %arg, i32 %arg2, i32 %arg3) {
+define amdgpu_kernel void @extra_and(ptr addrspace(1) %arg, i32 %arg2, i32 %arg3) {
 bb:
   br label %bb4
 
@@ -115,7 +115,7 @@ bb4:                                              ; preds = %bb4, %bb
   br i1 %tmp17, label %bb18, label %bb4
 
 bb18:                                             ; preds = %bb4
-  store i32 %tmp16, i32 addrspace(1)* %arg
+  store i32 %tmp16, ptr addrspace(1) %arg
   ret void
 }
 
@@ -125,7 +125,7 @@ bb18:                                             ; preds = %bb4
 ; SI: s_mul_i32
 ; SI: s_add_i32
 ; SI: s_add_i32
-define amdgpu_kernel void @dont_remove_shift(i32 addrspace(1)* %arg, i32 %arg2, i32 %arg3) {
+define amdgpu_kernel void @dont_remove_shift(ptr addrspace(1) %arg, i32 %arg2, i32 %arg3) {
 bb:
   br label %bb4
 
@@ -147,7 +147,7 @@ bb4:                                              ; preds = %bb4, %bb
   br i1 %tmp17, label %bb18, label %bb4
 
 bb18:                                             ; preds = %bb4
-  store i32 %tmp16, i32 addrspace(1)* %arg
+  store i32 %tmp16, ptr addrspace(1) %arg
   ret void
 }
 
@@ -161,15 +161,15 @@ bb18:                                             ; preds = %bb4
 ; VI: v_mad_u16 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; GCN: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16
 ; GCN: v_med3_i32 v{{[0-9]}}, [[EXT]],
-define amdgpu_kernel void @i8_mad_sat_16(i8 addrspace(1)* %out, i8 addrspace(1)* %in0, i8 addrspace(1)* %in1, i8 addrspace(1)* %in2, i64 addrspace(5)* %idx) {
+define amdgpu_kernel void @i8_mad_sat_16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(5) %idx) {
 entry:
-  %retval.0.i = load i64, i64 addrspace(5)* %idx
-  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 %retval.0.i
-  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 %retval.0.i
-  %arrayidx4 = getelementptr inbounds i8, i8 addrspace(1)* %in2, i64 %retval.0.i
-  %l1 = load i8, i8 addrspace(1)* %arrayidx, align 1
-  %l2 = load i8, i8 addrspace(1)* %arrayidx2, align 1
-  %l3 = load i8, i8 addrspace(1)* %arrayidx4, align 1
+  %retval.0.i = load i64, ptr addrspace(5) %idx
+  %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %in0, i64 %retval.0.i
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in1, i64 %retval.0.i
+  %arrayidx4 = getelementptr inbounds i8, ptr addrspace(1) %in2, i64 %retval.0.i
+  %l1 = load i8, ptr addrspace(1) %arrayidx, align 1
+  %l2 = load i8, ptr addrspace(1) %arrayidx2, align 1
+  %l3 = load i8, ptr addrspace(1) %arrayidx4, align 1
   %conv1.i = sext i8 %l1 to i16
   %conv3.i = sext i8 %l2 to i16
   %conv5.i = sext i8 %l3 to i16
@@ -180,8 +180,8 @@ entry:
   %c5 = icmp slt i16 %cond.i.i, 127
   %cond13.i.i = select i1 %c5, i16 %cond.i.i, i16 127
   %conv8.i = trunc i16 %cond13.i.i to i8
-  %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %retval.0.i
-  store i8 %conv8.i, i8 addrspace(1)* %arrayidx7, align 1
+  %arrayidx7 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 %retval.0.i
+  store i8 %conv8.i, ptr addrspace(1) %arrayidx7, align 1
   ret void
 }
 
@@ -194,22 +194,22 @@ entry:
 ; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; VI: v_mad_u16 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; GCN: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16
-define amdgpu_kernel void @i8_mad_32(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b, i8 addrspace(1)* %c, i64 addrspace(5)* %idx) {
+define amdgpu_kernel void @i8_mad_32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(5) %idx) {
 entry:
-  %retval.0.i = load i64, i64 addrspace(5)* %idx
-  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %a, i64 %retval.0.i
-  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %b, i64 %retval.0.i
-  %arrayidx4 = getelementptr inbounds i8, i8 addrspace(1)* %c, i64 %retval.0.i
-  %la = load i8, i8 addrspace(1)* %arrayidx, align 1
-  %lb = load i8, i8 addrspace(1)* %arrayidx2, align 1
-  %lc = load i8, i8 addrspace(1)* %arrayidx4, align 1
+  %retval.0.i = load i64, ptr addrspace(5) %idx
+  %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %a, i64 %retval.0.i
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %b, i64 %retval.0.i
+  %arrayidx4 = getelementptr inbounds i8, ptr addrspace(1) %c, i64 %retval.0.i
+  %la = load i8, ptr addrspace(1) %arrayidx, align 1
+  %lb = load i8, ptr addrspace(1) %arrayidx2, align 1
+  %lc = load i8, ptr addrspace(1) %arrayidx4, align 1
   %exta = sext i8 %la to i16
   %extb = sext i8 %lb to i16
   %extc = sext i8 %lc to i16
   %mul = mul i16 %exta, %extb
   %mad = add i16 %mul, %extc
   %mad_ext = sext i16 %mad to i32
-  store i32 %mad_ext, i32 addrspace(1)* %out
+  store i32 %mad_ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -222,22 +222,22 @@ entry:
 ; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; VI: v_mad_u16 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; GCN: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16
-define amdgpu_kernel void @i8_mad_64(i64 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b, i8 addrspace(1)* %c, i64 addrspace(5)* %idx) {
+define amdgpu_kernel void @i8_mad_64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(5) %idx) {
 entry:
-  %retval.0.i = load i64, i64 addrspace(5)* %idx
-  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %a, i64 %retval.0.i
-  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %b, i64 %retval.0.i
-  %arrayidx4 = getelementptr inbounds i8, i8 addrspace(1)* %c, i64 %retval.0.i
-  %la = load i8, i8 addrspace(1)* %arrayidx, align 1
-  %lb = load i8, i8 addrspace(1)* %arrayidx2, align 1
-  %lc = load i8, i8 addrspace(1)* %arrayidx4, align 1
+  %retval.0.i = load i64, ptr addrspace(5) %idx
+  %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %a, i64 %retval.0.i
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %b, i64 %retval.0.i
+  %arrayidx4 = getelementptr inbounds i8, ptr addrspace(1) %c, i64 %retval.0.i
+  %la = load i8, ptr addrspace(1) %arrayidx, align 1
+  %lb = load i8, ptr addrspace(1) %arrayidx2, align 1
+  %lc = load i8, ptr addrspace(1) %arrayidx4, align 1
   %exta = sext i8 %la to i16
   %extb = sext i8 %lb to i16
   %extc = sext i8 %lc to i16
   %mul = mul i16 %exta, %extb
   %mad = add i16 %mul, %extc
   %mad_ext = sext i16 %mad to i64
-  store i64 %mad_ext, i64 addrspace(1)* %out
+  store i64 %mad_ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -256,7 +256,7 @@ entry:
 ; GCN: v_mad_u32_u24
 ; GCN: v_mad_u32_u24
 ; GCN: v_mad_u32_u24
-define void @mad24_known_bits_destroyed(i32 %arg, <4 x i32> %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 addrspace(1)* %arg7, <4 x i32> addrspace(1)* %arg8) #0 {
+define void @mad24_known_bits_destroyed(i32 %arg, <4 x i32> %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i32 %arg4, i32 %arg5, i32 %arg6, ptr addrspace(1) %arg7, ptr addrspace(1) %arg8) #0 {
 bb:
   %tmp = and i32 %arg4, 16777215
   %tmp9 = extractelement <4 x i32> %arg1, i64 1
@@ -295,7 +295,7 @@ bb19:                                             ; preds = %bb19, %bb
   %tmp38 = and i32 %tmp25, 16777215
   %tmp39 = mul i32 %tmp38, %tmp
   %tmp40 = add i32 %tmp39, %arg5
-  store i32 %tmp40, i32 addrspace(1)* %arg7
+  store i32 %tmp40, ptr addrspace(1) %arg7
   %tmp41 = insertelement <4 x i32> undef, i32 %tmp40, i32 0
   %tmp42 = and i32 %tmp29, 16777215
   %tmp43 = mul i32 %tmp42, %tmp11
@@ -309,7 +309,7 @@ bb19:                                             ; preds = %bb19, %bb
   %tmp51 = mul i32 %tmp50, %tmp17
   %tmp52 = add i32 %tmp51, %tmp16
   %tmp53 = insertelement <4 x i32> %tmp49, i32 %tmp52, i32 3
-  store <4 x i32> %tmp53, <4 x i32> addrspace(1)* %arg8
+  store <4 x i32> %tmp53, ptr addrspace(1) %arg8
   %tmp54 = add nuw nsw i32 %tmp21, 1
   %tmp55 = icmp eq i32 %tmp54, %arg6
   br i1 %tmp55, label %bb18, label %bb19

diff  --git a/llvm/test/CodeGen/AMDGPU/mai-inline.ll b/llvm/test/CodeGen/AMDGPU/mai-inline.ll
index 8b7f542118d92..a97a87ffbc753 100644
--- a/llvm/test/CodeGen/AMDGPU/mai-inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/mai-inline.ll
@@ -4,12 +4,12 @@
 ; GFX908: v_accvgpr_write [[AREG:a[0-9]+]], 1
 ; GFX908: v_accvgpr_read [[VREG:v[0-9]+]], [[AREG]]
 ; GFX908: global_store_dword v{{[0-9]+}}, [[VREG]], s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @accvgpr_write_read(float addrspace(1)* %arg) {
+define amdgpu_kernel void @accvgpr_write_read(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load float, float addrspace(1)* %arg
+  %in.1 = load float, ptr addrspace(1) %arg
   %init = tail call float asm "v_accvgpr_write $0, 1", "=a"()
   %read = tail call float asm "v_accvgpr_read $0, $1", "=v,a"(float %init)
-  store float %read, float addrspace(1)* %arg
+  store float %read, ptr addrspace(1) %arg
   ret void
 }
 
@@ -23,11 +23,11 @@ bb:
 ; GFX908: v_accvgpr_read_b32
 ; GFX908: v_accvgpr_read_b32
 ; GFX908: v_accvgpr_read_b32
-define amdgpu_kernel void @v_mfma_f32_4x4x1f32_avva(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @v_mfma_f32_4x4x1f32_avva(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x1f32 $0, $1, $2, $3", "=a,v,v,a"(float 1.0, float 2.0, <4 x float> %in.1)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -41,11 +41,11 @@ bb:
 ; GFX908: v_accvgpr_read_b32
 ; GFX908: v_accvgpr_read_b32
 ; GFX908: v_accvgpr_read_b32
-define amdgpu_kernel void @v_mfma_f32_4x4x1f32_aaaa(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @v_mfma_f32_4x4x1f32_aaaa(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x1f32 $0, $1, $2, $3", "=a,a,a,a"(float 1.0, float 2.0, <4 x float> %in.1)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -63,11 +63,11 @@ bb:
 ; GFX908: v_accvgpr_read_b32
 ; GFX908: v_accvgpr_read_b32
 ; GFX908: v_accvgpr_read_b32
-define amdgpu_kernel void @v_mfma_f32_4x4x4f16_aaaa(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @v_mfma_f32_4x4x4f16_aaaa(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x4f16 $0, $1, $2, $3", "=a,a,a,a"(<4 x half> <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>, <4 x half> <half 0xH03FF, half 0xH03FF, half 0xH03FF, half 0xH03FF>, <4 x float> %in.1)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -106,11 +106,11 @@ bb:
 ; GFX908: v_accvgpr_read_b32
 ; GFX908: v_accvgpr_read_b32
 ; GFX908: v_accvgpr_read_b32
-define amdgpu_kernel void @v_mfma_f32_16x16x1f32_avaa(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @v_mfma_f32_16x16x1f32_avaa(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> asm "v_mfma_f32_16x16x1f32 $0, $1, $2, $3", "=a,v,a,a"(float 1.0, float 2.0, <16 x float> %in.1)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -181,10 +181,10 @@ bb:
 ; GFX908: v_accvgpr_read_b32
 ; GFX908: v_accvgpr_read_b32
 ; GFX908: v_accvgpr_read_b32
-define amdgpu_kernel void @v_mfma_f32_32x32x1f32_avaa(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @v_mfma_f32_32x32x1f32_avaa(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x i32> asm "v_mfma_f32_32x32x1f32 $0, $1, $2, $3", "=a,v,a,a"(float 1.0, float 2.0, <32 x i32> %in.1)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  store <32 x i32> %mai.1, ptr addrspace(1) %arg
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index 68c56dd3e0ea4..8ef2ca2765e8a 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX9
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; VI-LABEL: v_test_imax_sge_i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -39,19 +39,19 @@ define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep0, align 4
-  %b = load i16, i16 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
+  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep0, align 4
+  %b = load i16, ptr addrspace(1) %gep1, align 4
   %cmp = icmp sge i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
-  store i16 %val, i16 addrspace(1)* %outgep, align 4
+  store i16 %val, ptr addrspace(1) %outgep, align 4
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; VI-LABEL: v_test_imax_sge_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -89,19 +89,19 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep0, align 4
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %bptr, i32 %tid
+  %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %a = load <2 x i16>, ptr addrspace(1) %gep0, align 4
+  %b = load <2 x i16>, ptr addrspace(1) %gep1, align 4
   %cmp = icmp sge <2 x i16> %a, %b
   %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
-  store <2 x i16> %val, <2 x i16> addrspace(1)* %outgep, align 4
+  store <2 x i16> %val, ptr addrspace(1) %outgep, align 4
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; VI-LABEL: v_test_imax_sge_v3i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -160,19 +160,19 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    global_store_dword v0, v3, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid
-  %a = load <3 x i16>, <3 x i16> addrspace(1)* %gep0, align 4
-  %b = load <3 x i16>, <3 x i16> addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr <3 x i16>, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr <3 x i16>, ptr addrspace(1) %bptr, i32 %tid
+  %outgep = getelementptr <3 x i16>, ptr addrspace(1) %out, i32 %tid
+  %a = load <3 x i16>, ptr addrspace(1) %gep0, align 4
+  %b = load <3 x i16>, ptr addrspace(1) %gep1, align 4
   %cmp = icmp sge <3 x i16> %a, %b
   %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  store <3 x i16> %val, <3 x i16> addrspace(1)* %outgep, align 4
+  store <3 x i16> %val, ptr addrspace(1) %outgep, align 4
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; VI-LABEL: v_test_imax_sge_v4i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -214,19 +214,19 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid
-  %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep0, align 4
-  %b = load <4 x i16>, <4 x i16> addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr <4 x i16>, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %bptr, i32 %tid
+  %outgep = getelementptr <4 x i16>, ptr addrspace(1) %out, i32 %tid
+  %a = load <4 x i16>, ptr addrspace(1) %gep0, align 4
+  %b = load <4 x i16>, ptr addrspace(1) %gep1, align 4
   %cmp = icmp sge <4 x i16> %a, %b
   %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
-  store <4 x i16> %val, <4 x i16> addrspace(1)* %outgep, align 4
+  store <4 x i16> %val, ptr addrspace(1) %outgep, align 4
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; VI-LABEL: v_test_imax_sgt_i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -262,19 +262,19 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep0, align 4
-  %b = load i16, i16 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
+  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep0, align 4
+  %b = load i16, ptr addrspace(1) %gep1, align 4
   %cmp = icmp sgt i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
-  store i16 %val, i16 addrspace(1)* %outgep, align 4
+  store i16 %val, ptr addrspace(1) %outgep, align 4
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; VI-LABEL: v_test_umax_uge_i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -310,19 +310,19 @@ define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep0, align 4
-  %b = load i16, i16 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
+  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep0, align 4
+  %b = load i16, ptr addrspace(1) %gep1, align 4
   %cmp = icmp uge i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
-  store i16 %val, i16 addrspace(1)* %outgep, align 4
+  store i16 %val, ptr addrspace(1) %outgep, align 4
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; VI-LABEL: v_test_umax_ugt_i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -358,18 +358,18 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrs
 ; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep0, align 4
-  %b = load i16, i16 addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
+  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep0, align 4
+  %b = load i16, ptr addrspace(1) %gep1, align 4
   %cmp = icmp ugt i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
-  store i16 %val, i16 addrspace(1)* %outgep, align 4
+  store i16 %val, ptr addrspace(1) %outgep, align 4
   ret void
 }
 
-define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; VI-LABEL: v_test_umax_ugt_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -407,14 +407,14 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid
-  %outgep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep0, align 4
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep1, align 4
+  %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %bptr, i32 %tid
+  %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %a = load <2 x i16>, ptr addrspace(1) %gep0, align 4
+  %b = load <2 x i16>, ptr addrspace(1) %gep1, align 4
   %cmp = icmp ugt <2 x i16> %a, %b
   %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
-  store <2 x i16> %val, <2 x i16> addrspace(1)* %outgep, align 4
+  store <2 x i16> %val, ptr addrspace(1) %outgep, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index 70c0287590490..ce1b5bbe32ed2 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -6,14 +6,14 @@
 ; SI: v_max_i32_e32
 
 ; EG: MAX_INT
-define amdgpu_kernel void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
+  %a = load i32, ptr addrspace(1) %aptr, align 4
+  %b = load i32, ptr addrspace(1) %gep.in, align 4
   %cmp = icmp sge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -28,14 +28,14 @@ define amdgpu_kernel void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrs
 ; EG: MAX_INT
 ; EG: MAX_INT
 ; EG: MAX_INT
-define amdgpu_kernel void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %bptr, i32 %tid
-  %a = load <4 x i32>, <4 x i32> addrspace(1)* %aptr, align 4
-  %b = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <4 x i32>, ptr addrspace(1) %bptr, i32 %tid
+  %a = load <4 x i32>, ptr addrspace(1) %aptr, align 4
+  %b = load <4 x i32>, ptr addrspace(1) %gep.in, align 4
   %cmp = icmp sge <4 x i32> %a, %b
   %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
-  store <4 x i32> %val, <4 x i32> addrspace(1)* %out, align 4
+  store <4 x i32> %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -43,10 +43,10 @@ define amdgpu_kernel void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <
 ; SI: s_max_i32
 
 ; EG: MAX_INT
-define amdgpu_kernel void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp sge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -54,10 +54,10 @@ define amdgpu_kernel void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i
 ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
 
 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
-define amdgpu_kernel void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_imm_i32(ptr addrspace(1) %out, i32 %a) nounwind {
   %cmp = icmp sge i32 %a, 9
   %val = select i1 %cmp, i32 %a, i32 9
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -67,12 +67,12 @@ define amdgpu_kernel void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %
 ; SI: v_max_i32_e32
 
 ; EG: MAX_INT
-define amdgpu_kernel void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
-  %a = load i8, i8 addrspace(1)* %aptr, align 1
-  %b = load i8, i8 addrspace(1)* %bptr, align 1
+define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+  %a = load i8, ptr addrspace(1) %aptr, align 1
+  %b = load i8, ptr addrspace(1) %bptr, align 1
   %cmp = icmp sge i8 %a, %b
   %val = select i1 %cmp, i8 %a, i8 %b
-  store i8 %val, i8 addrspace(1)* %out, align 1
+  store i8 %val, ptr addrspace(1) %out, align 1
   ret void
 }
 
@@ -80,10 +80,10 @@ define amdgpu_kernel void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspac
 ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
 
 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
-define amdgpu_kernel void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imax_sgt_imm_i32(ptr addrspace(1) %out, i32 %a) nounwind {
   %cmp = icmp sgt i32 %a, 9
   %val = select i1 %cmp, i32 %a, i32 9
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -93,10 +93,10 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %
 
 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
-define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
   %cmp = icmp sgt <2 x i32> %a, <i32 9, i32 9>
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 9, i32 9>
-  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  store <2 x i32> %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -104,14 +104,14 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %ou
 ; SI: v_max_i32_e32
 
 ; EG: MAX_INT
-define amdgpu_kernel void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sgt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
+  %a = load i32, ptr addrspace(1) %aptr, align 4
+  %b = load i32, ptr addrspace(1) %gep.in, align 4
   %cmp = icmp sgt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -119,10 +119,10 @@ define amdgpu_kernel void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrs
 ; SI: s_max_i32
 
 ; EG: MAX_INT
-define amdgpu_kernel void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp sgt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -130,14 +130,14 @@ define amdgpu_kernel void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i
 ; SI: v_max_u32_e32
 
 ; EG: MAX_UINT
-define amdgpu_kernel void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_uge_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
+  %a = load i32, ptr addrspace(1) %aptr, align 4
+  %b = load i32, ptr addrspace(1) %gep.in, align 4
   %cmp = icmp uge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -145,10 +145,10 @@ define amdgpu_kernel void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrs
 ; SI: s_max_u32
 
 ; EG: MAX_UINT
-define amdgpu_kernel void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp uge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -163,10 +163,10 @@ define amdgpu_kernel void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i
 ; EG: MAX_UINT
 ; EG: MAX_UINT
 ; EG-NOT: MAX_UINT
-define amdgpu_kernel void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) %out, <3 x i32> %a, <3 x i32> %b) nounwind {
   %cmp = icmp uge <3 x i32> %a, %b
   %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
-  store <3 x i32> %val, <3 x i32> addrspace(1)* %out, align 4
+  store <3 x i32> %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -176,12 +176,12 @@ define amdgpu_kernel void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <
 ; SI: v_max_u32_e32
 
 ; EG: MAX_UINT
-define amdgpu_kernel void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
-  %a = load i8, i8 addrspace(1)* %aptr, align 1
-  %b = load i8, i8 addrspace(1)* %bptr, align 1
+define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+  %a = load i8, ptr addrspace(1) %aptr, align 1
+  %b = load i8, ptr addrspace(1) %bptr, align 1
   %cmp = icmp uge i8 %a, %b
   %val = select i1 %cmp, i8 %a, i8 %b
-  store i8 %val, i8 addrspace(1)* %out, align 1
+  store i8 %val, ptr addrspace(1) %out, align 1
   ret void
 }
 
@@ -189,14 +189,14 @@ define amdgpu_kernel void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspac
 ; SI: v_max_u32_e32
 
 ; EG: MAX_UINT
-define amdgpu_kernel void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep.in, align 4
-  %b = load i32, i32 addrspace(1)* %bptr, align 4
+  %gep.in = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep.in, align 4
+  %b = load i32, ptr addrspace(1) %bptr, align 4
   %cmp = icmp ugt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -204,10 +204,10 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrs
 ; SI: s_max_u32
 
 ; EG: MAX_UINT
-define amdgpu_kernel void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp ugt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -217,10 +217,10 @@ define amdgpu_kernel void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i
 
 ; EG: MAX_UINT {{.*}}literal.{{[xyzw]}}
 ; EG: MAX_UINT {{.*}}literal.{{[xyzw]}}
-define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
   %cmp = icmp ugt <2 x i32> %a, <i32 15, i32 23>
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 15, i32 23>
-  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  store <2 x i32> %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -235,13 +235,13 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %ou
 ; SI: buffer_store_dword [[VMAX]]
 
 ; EG: MAX_UINT
-define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) nounwind {
   %a.ext = zext i16 %a to i32
   %b.ext = zext i16 %b to i32
   %cmp = icmp ugt i32 %a.ext, %b.ext
   %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
   %mask = and i32 %val, 65535
-  store i32 %mask, i32 addrspace(1)* %out
+  store i32 %mask, ptr addrspace(1) %out
   ret void
 }
 
@@ -258,14 +258,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspac
 ; SI: buffer_store_dword [[VMAX]]
 
 ; EG: MAX_INT
-define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) nounwind {
   %a.ext = sext i16 %a to i32
   %b.ext = sext i16 %b to i32
   %cmp = icmp sgt i32 %a.ext, %b.ext
   %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
   %shl = shl i32 %val, 16
   %sextinreg = ashr i32 %shl, 16
-  store i32 %sextinreg, i32 addrspace(1)* %out
+  store i32 %sextinreg, ptr addrspace(1) %out
   ret void
 }
 
@@ -277,10 +277,10 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace
 ; SI: s_max_i32
 
 ; EG: MAX_INT
-define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, [8 x i32], i16 %a, [8 x i32], i16 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_i16(ptr addrspace(1) %out, [8 x i32], i16 %a, [8 x i32], i16 %b) nounwind {
   %cmp = icmp sge i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
-  store i16 %val, i16 addrspace(1)* %out
+  store i16 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -290,10 +290,10 @@ define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, [8 x i32]
 
 ; EG: MAX_UINT
 ; EG: MAX_UINT
-define amdgpu_kernel void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %tmp = icmp ugt i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
-  store i64 %val, i64 addrspace(1)* %out, align 8
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -302,10 +302,10 @@ define amdgpu_kernel void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64
 
 ; EG: MAX_UINT
 ; EG: MAX_UINT
-define amdgpu_kernel void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %tmp = icmp uge i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
-  store i64 %val, i64 addrspace(1)* %out, align 8
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -314,10 +314,10 @@ define amdgpu_kernel void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64
 
 ; EG-DAG: MAX_UINT
 ; EG-DAG: MAX_INT
-define amdgpu_kernel void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %tmp = icmp sgt i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
-  store i64 %val, i64 addrspace(1)* %out, align 8
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -326,10 +326,10 @@ define amdgpu_kernel void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64
 
 ; EG-DAG: MAX_UINT
 ; EG-DAG: MAX_INT
-define amdgpu_kernel void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imax_sge_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %tmp = icmp sge i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
-  store i64 %val, i64 addrspace(1)* %out, align 8
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/max3.ll b/llvm/test/CodeGen/AMDGPU/max3.ll
index 4dc5d484e1673..3e0c7e52be99e 100644
--- a/llvm/test/CodeGen/AMDGPU/max3.ll
+++ b/llvm/test/CodeGen/AMDGPU/max3.ll
@@ -4,39 +4,39 @@
 
 ; GCN-LABEL: {{^}}v_test_imax3_sgt_i32:
 ; GCN: v_max3_i32
-define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_imax3_sgt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0
-  %b = load i32, i32 addrspace(1)* %gep1
-  %c = load i32, i32 addrspace(1)* %gep2
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i32, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep0
+  %b = load i32, ptr addrspace(1) %gep1
+  %c = load i32, ptr addrspace(1) %gep2
   %icmp0 = icmp sgt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
   %icmp1 = icmp sgt i32 %i0, %c
   %i1 = select i1 %icmp1, i32 %i0, i32 %c
-  store i32 %i1, i32 addrspace(1)* %out
+  store i32 %i1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umax3_ugt_i32:
 ; GCN: v_max3_u32
-define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umax3_ugt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0
-  %b = load i32, i32 addrspace(1)* %gep1
-  %c = load i32, i32 addrspace(1)* %gep2
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i32, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep0
+  %b = load i32, ptr addrspace(1) %gep1
+  %c = load i32, ptr addrspace(1) %gep2
   %icmp0 = icmp ugt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
   %icmp1 = icmp ugt i32 %i0, %c
   %i1 = select i1 %icmp1, i32 %i0, i32 %c
-  store i32 %i1, i32 addrspace(1)* %out
+  store i32 %i1, ptr addrspace(1) %out
   ret void
 }
 
@@ -47,20 +47,20 @@ define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addr
 ; VI: v_max_i16
 
 ; GFX9: v_max3_i16
-define amdgpu_kernel void @v_test_imax3_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_imax3_sgt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep0
-  %b = load i16, i16 addrspace(1)* %gep1
-  %c = load i16, i16 addrspace(1)* %gep2
+  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i16, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep0
+  %b = load i16, ptr addrspace(1) %gep1
+  %c = load i16, ptr addrspace(1) %gep2
   %icmp0 = icmp sgt i16 %a, %b
   %i0 = select i1 %icmp0, i16 %a, i16 %b
   %icmp1 = icmp sgt i16 %i0, %c
   %i1 = select i1 %icmp1, i16 %i0, i16 %c
-  store i16 %i1, i16 addrspace(1)* %out
+  store i16 %i1, ptr addrspace(1) %out
   ret void
 }
 
@@ -71,20 +71,20 @@ define amdgpu_kernel void @v_test_imax3_sgt_i16(i16 addrspace(1)* %out, i16 addr
 ; VI: v_max_u16
 
 ; GFX9: v_max3_u16
-define amdgpu_kernel void @v_test_umax3_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umax3_ugt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep0
-  %b = load i16, i16 addrspace(1)* %gep1
-  %c = load i16, i16 addrspace(1)* %gep2
+  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i16, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep0
+  %b = load i16, ptr addrspace(1) %gep1
+  %c = load i16, ptr addrspace(1) %gep2
   %icmp0 = icmp ugt i16 %a, %b
   %i0 = select i1 %icmp0, i16 %a, i16 %b
   %icmp1 = icmp ugt i16 %i0, %c
   %i1 = select i1 %icmp1, i16 %i0, i16 %c
-  store i16 %i1, i16 addrspace(1)* %out
+  store i16 %i1, ptr addrspace(1) %out
   ret void
 }
 
@@ -95,20 +95,20 @@ define amdgpu_kernel void @v_test_umax3_ugt_i16(i16 addrspace(1)* %out, i16 addr
 ; VI: v_max_i16
 
 ; GFX9: v_max3_i16
-define amdgpu_kernel void @v_test_imax3_sgt_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr, i8 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_imax3_sgt_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i8, i8 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
-  %a = load i8, i8 addrspace(1)* %gep0
-  %b = load i8, i8 addrspace(1)* %gep1
-  %c = load i8, i8 addrspace(1)* %gep2
+  %gep0 = getelementptr i8, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i8, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i8, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i8, ptr addrspace(1) %out, i32 %tid
+  %a = load i8, ptr addrspace(1) %gep0
+  %b = load i8, ptr addrspace(1) %gep1
+  %c = load i8, ptr addrspace(1) %gep2
   %icmp0 = icmp sgt i8 %a, %b
   %i0 = select i1 %icmp0, i8 %a, i8 %b
   %icmp1 = icmp sgt i8 %i0, %c
   %i1 = select i1 %icmp1, i8 %i0, i8 %c
-  store i8 %i1, i8 addrspace(1)* %out
+  store i8 %i1, ptr addrspace(1) %out
   ret void
 }
 
@@ -119,20 +119,20 @@ define amdgpu_kernel void @v_test_imax3_sgt_i8(i8 addrspace(1)* %out, i8 addrspa
 ; VI: v_max_u16
 
 ; GFX9: v_max3_u16
-define amdgpu_kernel void @v_test_umax3_ugt_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr, i8 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umax3_ugt_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i8, i8 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
-  %a = load i8, i8 addrspace(1)* %gep0
-  %b = load i8, i8 addrspace(1)* %gep1
-  %c = load i8, i8 addrspace(1)* %gep2
+  %gep0 = getelementptr i8, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i8, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i8, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i8, ptr addrspace(1) %out, i32 %tid
+  %a = load i8, ptr addrspace(1) %gep0
+  %b = load i8, ptr addrspace(1) %gep1
+  %c = load i8, ptr addrspace(1) %gep2
   %icmp0 = icmp ugt i8 %a, %b
   %i0 = select i1 %icmp0, i8 %a, i8 %b
   %icmp1 = icmp ugt i8 %i0, %c
   %i1 = select i1 %icmp1, i8 %i0, i8 %c
-  store i8 %i1, i8 addrspace(1)* %out
+  store i8 %i1, ptr addrspace(1) %out
   ret void
 }
 
@@ -143,20 +143,20 @@ define amdgpu_kernel void @v_test_umax3_ugt_i8(i8 addrspace(1)* %out, i8 addrspa
 ; VI: v_max_i16
 
 ; GFX9: v_max3_i16
-define amdgpu_kernel void @v_test_imax3_sgt_i7(i7 addrspace(1)* %out, i7 addrspace(1)* %aptr, i7 addrspace(1)* %bptr, i7 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_imax3_sgt_i7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i7, i7 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i7, i7 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i7, i7 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i7, i7 addrspace(1)* %out, i32 %tid
-  %a = load i7, i7 addrspace(1)* %gep0
-  %b = load i7, i7 addrspace(1)* %gep1
-  %c = load i7, i7 addrspace(1)* %gep2
+  %gep0 = getelementptr i7, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i7, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i7, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i7, ptr addrspace(1) %out, i32 %tid
+  %a = load i7, ptr addrspace(1) %gep0
+  %b = load i7, ptr addrspace(1) %gep1
+  %c = load i7, ptr addrspace(1) %gep2
   %icmp0 = icmp sgt i7 %a, %b
   %i0 = select i1 %icmp0, i7 %a, i7 %b
   %icmp1 = icmp sgt i7 %i0, %c
   %i1 = select i1 %icmp1, i7 %i0, i7 %c
-  store i7 %i1, i7 addrspace(1)* %out
+  store i7 %i1, ptr addrspace(1) %out
   ret void
 }
 
@@ -167,96 +167,96 @@ define amdgpu_kernel void @v_test_imax3_sgt_i7(i7 addrspace(1)* %out, i7 addrspa
 ; VI: v_max_u16
 
 ; GFX9: v_max3_u16
-define amdgpu_kernel void @v_test_umax3_ugt_i7(i7 addrspace(1)* %out, i7 addrspace(1)* %aptr, i7 addrspace(1)* %bptr, i7 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umax3_ugt_i7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i7, i7 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i7, i7 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i7, i7 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i7, i7 addrspace(1)* %out, i32 %tid
-  %a = load i7, i7 addrspace(1)* %gep0
-  %b = load i7, i7 addrspace(1)* %gep1
-  %c = load i7, i7 addrspace(1)* %gep2
+  %gep0 = getelementptr i7, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i7, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i7, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i7, ptr addrspace(1) %out, i32 %tid
+  %a = load i7, ptr addrspace(1) %gep0
+  %b = load i7, ptr addrspace(1) %gep1
+  %c = load i7, ptr addrspace(1) %gep2
   %icmp0 = icmp ugt i7 %a, %b
   %i0 = select i1 %icmp0, i7 %a, i7 %b
   %icmp1 = icmp ugt i7 %i0, %c
   %i1 = select i1 %icmp1, i7 %i0, i7 %c
-  store i7 %i1, i7 addrspace(1)* %out
+  store i7 %i1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_imax3_sgt_i33:
 ; GCN-NOT: v_max3
-define amdgpu_kernel void @v_test_imax3_sgt_i33(i33 addrspace(1)* %out, i33 addrspace(1)* %aptr, i33 addrspace(1)* %bptr, i33 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_imax3_sgt_i33(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i33, i33 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i33, i33 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i33, i33 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i33, i33 addrspace(1)* %out, i32 %tid
-  %a = load i33, i33 addrspace(1)* %gep0
-  %b = load i33, i33 addrspace(1)* %gep1
-  %c = load i33, i33 addrspace(1)* %gep2
+  %gep0 = getelementptr i33, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i33, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i33, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i33, ptr addrspace(1) %out, i32 %tid
+  %a = load i33, ptr addrspace(1) %gep0
+  %b = load i33, ptr addrspace(1) %gep1
+  %c = load i33, ptr addrspace(1) %gep2
   %icmp0 = icmp sgt i33 %a, %b
   %i0 = select i1 %icmp0, i33 %a, i33 %b
   %icmp1 = icmp sgt i33 %i0, %c
   %i1 = select i1 %icmp1, i33 %i0, i33 %c
-  store i33 %i1, i33 addrspace(1)* %out
+  store i33 %i1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umax3_ugt_i33:
 ; GCN-NOT: v_max3
-define amdgpu_kernel void @v_test_umax3_ugt_i33(i33 addrspace(1)* %out, i33 addrspace(1)* %aptr, i33 addrspace(1)* %bptr, i33 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umax3_ugt_i33(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i33, i33 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i33, i33 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i33, i33 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i33, i33 addrspace(1)* %out, i32 %tid
-  %a = load i33, i33 addrspace(1)* %gep0
-  %b = load i33, i33 addrspace(1)* %gep1
-  %c = load i33, i33 addrspace(1)* %gep2
+  %gep0 = getelementptr i33, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i33, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i33, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i33, ptr addrspace(1) %out, i32 %tid
+  %a = load i33, ptr addrspace(1) %gep0
+  %b = load i33, ptr addrspace(1) %gep1
+  %c = load i33, ptr addrspace(1) %gep2
   %icmp0 = icmp ugt i33 %a, %b
   %i0 = select i1 %icmp0, i33 %a, i33 %b
   %icmp1 = icmp ugt i33 %i0, %c
   %i1 = select i1 %icmp1, i33 %i0, i33 %c
-  store i33 %i1, i33 addrspace(1)* %out
+  store i33 %i1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_imax3_sgt_i64:
 ; GCN-NOT: v_max3
-define amdgpu_kernel void @v_test_imax3_sgt_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_imax3_sgt_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i64, i64 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
-  %a = load i64, i64 addrspace(1)* %gep0
-  %b = load i64, i64 addrspace(1)* %gep1
-  %c = load i64, i64 addrspace(1)* %gep2
+  %gep0 = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i64, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i64, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
+  %a = load i64, ptr addrspace(1) %gep0
+  %b = load i64, ptr addrspace(1) %gep1
+  %c = load i64, ptr addrspace(1) %gep2
   %icmp0 = icmp sgt i64 %a, %b
   %i0 = select i1 %icmp0, i64 %a, i64 %b
   %icmp1 = icmp sgt i64 %i0, %c
   %i1 = select i1 %icmp1, i64 %i0, i64 %c
-  store i64 %i1, i64 addrspace(1)* %out
+  store i64 %i1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umax3_ugt_i64:
 ; GCN-NOT: v_max3
-define amdgpu_kernel void @v_test_umax3_ugt_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umax3_ugt_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i64, i64 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
-  %a = load i64, i64 addrspace(1)* %gep0
-  %b = load i64, i64 addrspace(1)* %gep1
-  %c = load i64, i64 addrspace(1)* %gep2
+  %gep0 = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i64, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i64, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
+  %a = load i64, ptr addrspace(1) %gep0
+  %b = load i64, ptr addrspace(1) %gep1
+  %c = load i64, ptr addrspace(1) %gep2
   %icmp0 = icmp ugt i64 %a, %b
   %i0 = select i1 %icmp0, i64 %a, i64 %b
   %icmp1 = icmp ugt i64 %i0, %c
   %i1 = select i1 %icmp1, i64 %i0, i64 %c
-  store i64 %i1, i64 addrspace(1)* %out
+  store i64 %i1, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/med3-no-simplify.ll b/llvm/test/CodeGen/AMDGPU/med3-no-simplify.ll
index efa104b732f50..512b7b3e3c121 100644
--- a/llvm/test/CodeGen/AMDGPU/med3-no-simplify.ll
+++ b/llvm/test/CodeGen/AMDGPU/med3-no-simplify.ll
@@ -11,11 +11,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32:
 ; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
 ; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
-define amdgpu_kernel void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_constant_order_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep0
 
   %icmp0 = icmp ugt i32 %a, 17
   %i0 = select i1 %icmp0, i32 %a, i32 17
@@ -23,18 +23,18 @@ define amdgpu_kernel void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1
   %icmp1 = icmp ult i32 %i0, 12
   %i1 = select i1 %icmp1, i32 %i0, i32 12
 
-  store i32 %i1, i32 addrspace(1)* %outgep
+  store i32 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32:
 ; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
 ; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
-define amdgpu_kernel void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_smed3_r_i_i_constant_order_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep0
 
   %icmp0 = icmp sgt i32 %a, 17
   %i0 = select i1 %icmp0, i32 %a, i32 17
@@ -42,7 +42,7 @@ define amdgpu_kernel void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1
   %icmp1 = icmp slt i32 %i0, 12
   %i1 = select i1 %icmp1, i32 %i0, i32 12
 
-  store i32 %i1, i32 addrspace(1)* %outgep
+  store i32 %i1, ptr addrspace(1) %outgep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 024f04b4cc824..52dd4afcdca5f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SCRATCH %s
 
-define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
+define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) {
 ; GCN-LABEL: vector_clause:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -44,29 +44,29 @@ define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocaptu
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = zext i32 %tmp to i64
-  %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp2
-  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 16
-  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp2
+  %tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
+  %tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
+  %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
   %tmp6 = add nuw nsw i64 %tmp2, 1
-  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp6
-  %tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
-  %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
+  %tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
+  %tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
+  %tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
   %tmp10 = add nuw nsw i64 %tmp2, 2
-  %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
-  %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
-  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
+  %tmp11 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp10
+  %tmp12 = load <4 x i32>, ptr addrspace(1) %tmp11, align 16
+  %tmp13 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp10
   %tmp14 = add nuw nsw i64 %tmp2, 3
-  %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
-  %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
-  %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
-  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp5, align 16
-  store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
-  store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
-  store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
+  %tmp15 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp14
+  %tmp16 = load <4 x i32>, ptr addrspace(1) %tmp15, align 16
+  %tmp17 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp14
+  store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
+  store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
+  store <4 x i32> %tmp12, ptr addrspace(1) %tmp13, align 16
+  store <4 x i32> %tmp16, ptr addrspace(1) %tmp17, align 16
   ret void
 }
 
-define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
+define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) {
 ; GCN-LABEL: scalar_clause:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
@@ -125,24 +125,24 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocaptu
 ; GCN-SCRATCH-NEXT:    global_store_dwordx4 v16, v[12:15], s[18:19] offset:48
 ; GCN-SCRATCH-NEXT:    s_endpgm
 bb:
-  %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16
-  %tmp2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 1
-  %tmp3 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp2, align 16
-  %tmp4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 1
-  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 2
-  %tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 16
-  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 2
-  %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 3
-  %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
-  %tmp10 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
-  store <4 x i32> %tmp, <4 x i32> addrspace(1)* %arg1, align 16
-  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %tmp4, align 16
-  store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp7, align 16
-  store <4 x i32> %tmp9, <4 x i32> addrspace(1)* %tmp10, align 16
+  %tmp = load <4 x i32>, ptr addrspace(1) %arg, align 16
+  %tmp2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 1
+  %tmp3 = load <4 x i32>, ptr addrspace(1) %tmp2, align 16
+  %tmp4 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 1
+  %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 2
+  %tmp6 = load <4 x i32>, ptr addrspace(1) %tmp5, align 16
+  %tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 2
+  %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 3
+  %tmp9 = load <4 x i32>, ptr addrspace(1) %tmp8, align 16
+  %tmp10 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 3
+  store <4 x i32> %tmp, ptr addrspace(1) %arg1, align 16
+  store <4 x i32> %tmp3, ptr addrspace(1) %tmp4, align 16
+  store <4 x i32> %tmp6, ptr addrspace(1) %tmp7, align 16
+  store <4 x i32> %tmp9, ptr addrspace(1) %tmp10, align 16
   ret void
 }
 
-define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %arg, <4 x i32> addrspace(5)* noalias nocapture %arg1) {
+define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr addrspace(5) noalias nocapture %arg1) {
 ; GCN-LABEL: mubuf_clause:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -227,29 +227,29 @@ define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %ar
 ; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg, i32 %tmp
-  %tmp3 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp2, align 16
-  %tmp4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg1, i32 %tmp
+  %tmp2 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp
+  %tmp3 = load <4 x i32>, ptr addrspace(5) %tmp2, align 16
+  %tmp4 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp
   %tmp5 = add nuw nsw i32 %tmp, 1
-  %tmp6 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg, i32 %tmp5
-  %tmp7 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp6, align 16
-  %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg1, i32 %tmp5
+  %tmp6 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp5
+  %tmp7 = load <4 x i32>, ptr addrspace(5) %tmp6, align 16
+  %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp5
   %tmp9 = add nuw nsw i32 %tmp, 2
-  %tmp10 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg, i32 %tmp9
-  %tmp11 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp10, align 16
-  %tmp12 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg1, i32 %tmp9
+  %tmp10 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp9
+  %tmp11 = load <4 x i32>, ptr addrspace(5) %tmp10, align 16
+  %tmp12 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp9
   %tmp13 = add nuw nsw i32 %tmp, 3
-  %tmp14 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg, i32 %tmp13
-  %tmp15 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp14, align 16
-  %tmp16 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg1, i32 %tmp13
-  store <4 x i32> %tmp3, <4 x i32> addrspace(5)* %tmp4, align 16
-  store <4 x i32> %tmp7, <4 x i32> addrspace(5)* %tmp8, align 16
-  store <4 x i32> %tmp11, <4 x i32> addrspace(5)* %tmp12, align 16
-  store <4 x i32> %tmp15, <4 x i32> addrspace(5)* %tmp16, align 16
+  %tmp14 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp13
+  %tmp15 = load <4 x i32>, ptr addrspace(5) %tmp14, align 16
+  %tmp16 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp13
+  store <4 x i32> %tmp3, ptr addrspace(5) %tmp4, align 16
+  store <4 x i32> %tmp7, ptr addrspace(5) %tmp8, align 16
+  store <4 x i32> %tmp11, ptr addrspace(5) %tmp12, align 16
+  store <4 x i32> %tmp15, ptr addrspace(5) %tmp16, align 16
   ret void
 }
 
-define amdgpu_kernel void @vector_clause_indirect(i64 addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture readnone %arg1, <4 x i32> addrspace(1)* noalias nocapture %arg2) {
+define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture readnone %arg1, ptr addrspace(1) noalias nocapture %arg2) {
 ; GCN-LABEL: vector_clause_indirect:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -287,19 +287,18 @@ define amdgpu_kernel void @vector_clause_indirect(i64 addrspace(1)* noalias noca
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp3 = zext i32 %tmp to i64
-  %tmp4 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp3
-  %tmp5 = bitcast i64 addrspace(1)* %tmp4 to <4 x i32> addrspace(1)* addrspace(1)*
-  %tmp6 = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(1)* %tmp5, align 8
-  %tmp7 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp6, align 16
-  %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %tmp6, i64 1
-  %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
-  store <4 x i32> %tmp7, <4 x i32> addrspace(1)* %arg2, align 16
-  %tmp10 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg2, i64 1
-  store <4 x i32> %tmp9, <4 x i32> addrspace(1)* %tmp10, align 16
+  %tmp4 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp3
+  %tmp6 = load ptr addrspace(1), ptr addrspace(1) %tmp4, align 8
+  %tmp7 = load <4 x i32>, ptr addrspace(1) %tmp6, align 16
+  %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %tmp6, i64 1
+  %tmp9 = load <4 x i32>, ptr addrspace(1) %tmp8, align 16
+  store <4 x i32> %tmp7, ptr addrspace(1) %arg2, align 16
+  %tmp10 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg2, i64 1
+  store <4 x i32> %tmp9, ptr addrspace(1) %tmp10, align 16
   ret void
 }
 
-define void @load_global_d16_hi(i16 addrspace(1)* %in, i16 %reg, <2 x i16> addrspace(1)* %out) {
+define void @load_global_d16_hi(ptr addrspace(1) %in, i16 %reg, ptr addrspace(1) %out) {
 ; GCN-LABEL: load_global_d16_hi:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -329,20 +328,20 @@ define void @load_global_d16_hi(i16 addrspace(1)* %in, i16 %reg, <2 x i16> addrs
 ; GCN-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 32
-  %load1 = load i16, i16 addrspace(1)* %in
-  %load2 = load i16, i16 addrspace(1)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 32
+  %load1 = load i16, ptr addrspace(1) %in
+  %load2 = load i16, ptr addrspace(1) %gep
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %build1, ptr addrspace(1) %out
   %build2 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build3 = insertelement <2 x i16> %build2, i16 %load2, i32 1
-  %gep2 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 32
-  store <2 x i16> %build3, <2 x i16> addrspace(1)* %gep2
+  %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 32
+  store <2 x i16> %build3, ptr addrspace(1) %gep2
   ret void
 }
 
-define void @load_global_d16_lo(i16 addrspace(1)* %in, i32 %reg, <2 x i16> addrspace(1)* %out) {
+define void @load_global_d16_lo(ptr addrspace(1) %in, i32 %reg, ptr addrspace(1) %out) {
 ; GCN-LABEL: load_global_d16_lo:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -372,16 +371,16 @@ define void @load_global_d16_lo(i16 addrspace(1)* %in, i32 %reg, <2 x i16> addrs
 ; GCN-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 32
+  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 32
   %reg.bc1 = bitcast i32 %reg to <2 x i16>
   %reg.bc2 = bitcast i32 %reg to <2 x i16>
-  %load1 = load i16, i16 addrspace(1)* %in
-  %load2 = load i16, i16 addrspace(1)* %gep
+  %load1 = load i16, ptr addrspace(1) %in
+  %load2 = load i16, ptr addrspace(1) %gep
   %build1 = insertelement <2 x i16> %reg.bc1, i16 %load1, i32 0
   %build2 = insertelement <2 x i16> %reg.bc2, i16 %load2, i32 0
-  %gep2 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 32
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* %out
-  store <2 x i16> %build2, <2 x i16> addrspace(1)* %gep2
+  %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 32
+  store <2 x i16> %build1, ptr addrspace(1) %out
+  store <2 x i16> %build2, ptr addrspace(1) %gep2
   ret void
 }
 
@@ -445,11 +444,11 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
 ; GCN-SCRATCH-NEXT:    s_endpgm
 .entry:
   %alloca = alloca float, align 4, addrspace(5)
-  store volatile float 5.5, float addrspace(5)* %alloca
+  store volatile float 5.5, ptr addrspace(5) %alloca
   call void asm sideeffect "", ""()
  ; There was a bug with flat scratch instructions that do not use any address registers (ST mode).
   ; To trigger, the scratch_load has to be immediately before the image_sample in MIR.
-  %load = load float, float addrspace(5)* %alloca
+  %load = load float, ptr addrspace(5) %alloca
   %val = call <2 x float> @llvm.amdgcn.image.sample.2d.v2f32.f32(i32 9, float %a, float %b, <8 x i32> %desc, <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 0>, i1 false, i32 0, i32 0)
   %val0 = extractelement <2 x float> %val, i32 0
   %valadd = fadd float %load, %val0
@@ -505,11 +504,11 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32
 .entry:
   %alloca = alloca float, align 4, addrspace(5)
   %alloca2 = alloca float, align 4, addrspace(5)
-  store volatile float 5.5, float addrspace(5)* %alloca
-  store volatile float 6.5, float addrspace(5)* %alloca2
+  store volatile float 5.5, ptr addrspace(5) %alloca
+  store volatile float 6.5, ptr addrspace(5) %alloca2
   call void asm sideeffect "", ""()
-  %load0 = load float, float addrspace(5)* %alloca
-  %load1 = load float, float addrspace(5)* %alloca2
+  %load0 = load float, ptr addrspace(5) %alloca
+  %load1 = load float, ptr addrspace(5) %alloca2
   %valadd = fadd float %load0, %load1
   call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 1, float %valadd, float undef, float undef, float undef, i1 immarg true, i1 immarg true)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/mesa_regression.ll b/llvm/test/CodeGen/AMDGPU/mesa_regression.ll
index 90c4fe2ff54ae..4b669ac572bd6 100644
--- a/llvm/test/CodeGen/AMDGPU/mesa_regression.ll
+++ b/llvm/test/CodeGen/AMDGPU/mesa_regression.ll
@@ -3,9 +3,9 @@
 ; CHECK-LABEL: %entry
 ; CHECK: flat_load_dwordx4
 
-define amdgpu_kernel void @store_global(<16 x double> addrspace(1)* nocapture %out, <16 x double> addrspace(1)* nocapture readonly %in) {
+define amdgpu_kernel void @store_global(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture readonly %in) {
 entry:
-  %tmp = load <16 x double>, <16 x double> addrspace(1)* %in
-  store <16 x double> %tmp, <16 x double> addrspace(1)* %out
+  %tmp = load <16 x double>, ptr addrspace(1) %in
+  store <16 x double> %tmp, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll
index 19828663accac..0acacaa0e4004 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll
@@ -18,141 +18,141 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16:
 ; GCN:  v_mfma_f32_32x32x2bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %a = bitcast i32 1 to <2 x i16>
   %b = bitcast i32 2 to <2 x i16>
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16:
 ; GCN: v_mfma_f32_16x16x2bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16:
 ; GCN: v_mfma_f32_4x4x2bf16 v[{{[0-9:]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16:
 ; GCN: v_mfma_f32_32x32x4bf16 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16:
 ; GCN: v_mfma_f32_16x16x8bf16 v[{{[0-9:]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k:
 ; GCN:      v_mfma_f32_32x32x4bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(<32 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <32 x float> %in.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k:
 ; GCN: v_mfma_f32_16x16x4bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4bf16_1k:
 ; GCN: v_mfma_f32_4x4x4bf16_1k v[{{[0-9:]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k:
 ; GCN: v_mfma_f32_32x32x8bf16_1k v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k:
 ; GCN: v_mfma_f32_16x16x16bf16_1k v[{{[0-9:]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f64_4x4x4f64:
 ; GCN: v_mfma_f64_4x4x4f64 v[{{[0-9:]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+:[0-9]+}}
-define amdgpu_kernel void @test_mfma_f64_4x4x4f64(double addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg) {
 bb:
   %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double 1.0, double 1.0, double 128.0, i32 0, i32 0, i32 0)
-  store double %mai.1, double addrspace(1)* %arg
+  store double %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64:
 ; GCN: v_mfma_f64_16x16x4f64 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f64_16x16x4f64(<4 x double> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x double>, <4 x double> addrspace(1)* %arg
+  %in.1 = load <4 x double>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double 1.0, double 1.0, <4 x double> %in.1, i32 0, i32 0, i32 0)
-  store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
+  store <4 x double> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8:
 ; GCN: v_mfma_i32_32x32x8i8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_i32_32x32x8i8(<16 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
+  %in.1 = load <16 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 1, <16 x i32> %in.1, i32 0, i32 0, i32 0)
-  store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
+  store <16 x i32> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8:
 ; GCN: v_mfma_i32_16x16x16i8 v[{{[0-9:]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_i32_16x16x16i8(<4 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
+  %in.1 = load <4 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 1, <4 x i32> %in.1, i32 0, i32 0, i32 0)
-  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
+  store <4 x i32> %mai.1, ptr addrspace(1) %arg
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
index a39bd5cf2888d..96030a457b57e 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
@@ -9,66 +9,66 @@ declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vgpr:
 ; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
 ; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) #0 {
 bb:
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_agpr:
 ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(<32 x float> addrspace(1)* %arg) #1 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #1 {
 bb:
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
 ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) #0 {
 bb:
   %acc = call i32 asm sideeffect "; def $0", "={a0}"()
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
 ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) #0 {
 bb:
   call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> undef)
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
 ; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
 ; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addrspace(1) %arg) #0 {
 bb:
   %acc = call i32 asm sideeffect "; def $0", "={v0}"()
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call:
 ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #0 {
 bb:
   call void @foo()
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -78,11 +78,11 @@ bb:
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call_multi_bb:
 ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg) #0 {
 bb1:
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   br i1 undef, label %bb2, label %bb3
   br label %bb2
 
@@ -96,11 +96,11 @@ bb3:
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry:
 ; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-define void @test_mfma_f32_32x32x1f32_nonentry(<32 x float> addrspace(1)* %arg) #0 {
+define void @test_mfma_f32_32x32x1f32_nonentry(ptr addrspace(1) %arg) #0 {
 bb:
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 61ce2689f333d..323d41732c8f7 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -24,7 +24,7 @@
 ; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
 ; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
-define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 entry:
   br label %for.cond.preheader
 
@@ -37,7 +37,7 @@ for.cond.preheader:
   br i1 %cc, label %exit, label %for.cond.preheader
 
 exit:
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -65,7 +65,7 @@ exit:
 ; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
 ; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
-define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg) #0 {
 entry:
   br label %for.cond.preheader
 
@@ -78,7 +78,7 @@ for.cond.preheader:
   br i1 %cc, label %exit, label %for.cond.preheader
 
 exit:
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -101,7 +101,7 @@ exit:
 ; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
 ; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
-define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 entry:
   br label %for.cond.preheader
 
@@ -114,7 +114,7 @@ for.cond.preheader:
   br i1 %cc, label %exit, label %for.cond.preheader
 
 exit:
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -203,7 +203,7 @@ exit:
 ; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
 ; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
-define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) #0 {
 entry:
   br label %for.cond.preheader
 
@@ -216,7 +216,7 @@ for.cond.preheader:
   br i1 %cc, label %exit, label %for.cond.preheader
 
 exit:
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -236,7 +236,7 @@ exit:
 ; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
 ; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
-define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %init = bitcast i32 %tid to float
@@ -284,7 +284,7 @@ for.cond.preheader:
   br i1 %cc, label %exit, label %for.cond.preheader
 
 exit:
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -307,7 +307,7 @@ exit:
 ; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
 ; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
-define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) #0 {
+define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float %init) #0 {
 entry:
   %tmp0 = insertelement <32 x float> undef, float %init, i32 0
   %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
@@ -353,7 +353,7 @@ for.cond.preheader:
   br i1 %cc, label %exit, label %for.cond.preheader
 
 exit:
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -409,7 +409,7 @@ exit:
 ; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
 ; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
-define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) #0 {
+define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, float %x) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %init = bitcast i32 %tid to float
@@ -427,7 +427,7 @@ for.cond.preheader:
   br i1 %cc, label %exit, label %for.cond.preheader
 
 exit:
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -453,7 +453,7 @@ exit:
 ; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
 ; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
-define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %arg) #0 {
 entry:
   %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
 
@@ -468,7 +468,7 @@ for.cond.preheader:
   br i1 %cc, label %exit, label %for.cond.preheader
 
 exit:
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -500,7 +500,7 @@ exit:
 ; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
 ; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
-define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 entry:
   %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
   %init = extractelement <32 x float> %mai.0, i32 0
@@ -548,7 +548,7 @@ for.cond.preheader:
   br i1 %cc, label %exit, label %for.cond.preheader
 
 exit:
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -578,7 +578,7 @@ exit:
 ; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
 ; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
 
-define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 entry:
   br label %for.cond.preheader
 
@@ -601,7 +601,7 @@ inner.exit:
   br i1 %cc.0, label %exit, label %for.cond.preheader
 
 exit:
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index 28539a49a965f..5b094e647fde8 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -17,14 +17,14 @@ declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i3
 ; FAST:   v_mfma_f32_32x32x1{{.*}} a[64:95], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95]
 ; FAST:   v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95]
 ; GCN:    v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 bb:
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
   %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
   %tmp.1 = shufflevector <32 x float> %mai.2, <32 x float> %mai.1, <32 x i32> <i32 32, i32 33, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29>
   %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %tmp.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.3, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -41,14 +41,14 @@ bb:
 ; FAST:   v_mfma_f32_16x16x1{{.*}} a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47]
 ; FAST:   v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47]
 ; GCN:    v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
-define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
   %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0)
   %tmp.1 = shufflevector <16 x float> %mai.2, <16 x float> %mai.1, <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
   %mai.3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %tmp.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.3, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -60,14 +60,14 @@ bb:
 ; FAST:   v_mfma_f32_4x4x1{{.*}} a[8:11], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
 ; FAST:   v_mfma_f32_4x4x1{{.*}} a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[8:11]
 ; GCN:    v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
-define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
   %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
   %tmp.1 = shufflevector <4 x float> %mai.1, <4 x float> %mai.2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   %mai.3 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %tmp.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.3, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.3, ptr addrspace(1) %arg
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
index 65c0c674da309..806ad620df898 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll
@@ -30,260 +30,260 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i3
 
 ; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8:
 ; GCN: v_mfma_i32_16x16x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_i32_16x16x32i8(<4 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
+  %in.1 = load <4 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 0, i32 0, i32 0)
-  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
+  store <4 x i32> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_i32_32x32x16i8:
 ; GCN: v_mfma_i32_32x32x16_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_i32_32x32x16i8(<16 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
+  %in.1 = load <16 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 4294967298, i64 12884901892, <16 x i32> %in.1, i32 0, i32 0, i32 0)
-  store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
+  store <16 x i32> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32:
 ; GCN: v_mfma_f32_16x16x8_xf32 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32:
 ; GCN: v_mfma_f32_32x32x4_xf32 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_bf8:
 ; GCN: v_mfma_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_fp8:
 ; GCN: v_mfma_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_bf8:
 ; GCN: v_mfma_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_fp8:
 ; GCN: v_mfma_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_bf8:
 ; GCN: v_mfma_f32_32x32x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_fp8:
 ; GCN: v_mfma_f32_32x32x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_bf8:
 ; GCN: v_mfma_f32_32x32x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_fp8:
 ; GCN: v_mfma_f32_32x32x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_f16:
 ; GCN: v_smfmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(<4 x float> addrspace(1)* %arg, <4 x half> %a, <8 x half> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_f16:
 ; GCN: v_smfmac_f32_32x32x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(<16 x float> addrspace(1)* %arg, <4 x half> %a, <8 x half> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_bf16:
 ; GCN: v_smfmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(<4 x float> addrspace(1)* %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_bf16:
 ; GCN: v_smfmac_f32_32x32x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(<16 x float> addrspace(1)* %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_i8:
 ; GCN: v_smfmac_i32_16x16x64_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(<4 x i32> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
 bb:
-  %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
+  %in.1 = load <4 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 0, i32 0)
-  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
+  store <4 x i32> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_i8:
 ; GCN: v_smfmac_i32_32x32x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(<16 x i32> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
 bb:
-  %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
+  %in.1 = load <16 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 0, i32 0)
-  store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
+  store <16 x i32> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_bf8:
 ; GCN: v_smfmac_f32_16x16x64_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_fp8:
 ; GCN: v_smfmac_f32_16x16x64_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_bf8:
 ; GCN: v_smfmac_f32_16x16x64_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_fp8:
 ; GCN: v_smfmac_f32_16x16x64_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(<4 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_bf8:
 ; GCN: v_smfmac_f32_32x32x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_fp8:
 ; GCN: v_smfmac_f32_32x32x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_bf8:
 ; GCN: v_smfmac_f32_32x32x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_fp8:
 ; GCN: v_smfmac_f32_32x32x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(<16 x float> addrspace(1)* %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
+define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll
index 6064107bf8432..2f2f90f0b4774 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select.ll
@@ -19,130 +19,130 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32:
 ; GCN: v_mfma_f32_32x32x1{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32:
 ; GCN: v_mfma_f32_16x16x1{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32:
 ; GCN: v_mfma_f32_4x4x1{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32:
 ; GCN: v_mfma_f32_32x32x2{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x2f32(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32:
 ; GCN: v_mfma_f32_16x16x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x4f32(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16:
 ; GCN: v_mfma_f32_32x32x4{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %in.1 = load <32 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> undef, <4 x half> undef, <32 x float> %in.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  store <32 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16:
 ; GCN: v_mfma_f32_16x16x4{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x4f16(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> undef, <4 x half> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16:
 ; GCN: v_mfma_f32_4x4x4{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_4x4x4f16(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> undef, <4 x half> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16:
 ; GCN: v_mfma_f32_32x32x8{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_32x32x8f16(<16 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %in.1 = load <16 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> undef, <4 x half> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
-  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  store <16 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16:
 ; GCN: v_mfma_f32_16x16x16{{.*}} v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_f32_16x16x16f16(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> undef, <4 x half> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8:
 ; GCN: v_mfma_i32_32x32x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_i32_32x32x4i8(<32 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
+  %in.1 = load <32 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 1, <32 x i32> %in.1, i32 0, i32 0, i32 0)
-  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
+  store <32 x i32> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8:
 ; GCN: v_mfma_i32_16x16x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_i32_16x16x4i8(<16 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
+  %in.1 = load <16 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 1, <16 x i32> %in.1, i32 0, i32 0, i32 0)
-  store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
+  store <16 x i32> %mai.1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8:
 ; GCN: v_mfma_i32_4x4x4{{.*}} v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test_mfma_i32_4x4x4i8(<4 x i32> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) {
 bb:
-  %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
+  %in.1 = load <4 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 1, <4 x i32> %in.1, i32 0, i32 0, i32 0)
-  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
+  store <4 x i32> %mai.1, ptr addrspace(1) %arg
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 1dfc85d2ece60..c65db9ea22854 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -9,16 +9,16 @@
 ; GCN: v_min_i32_e32
 
 ; EG: MIN_INT
-define amdgpu_kernel void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
-  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %a.gep, align 4
-  %b = load i32, i32 addrspace(1)* %b.gep, align 4
+  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %a.gep, align 4
+  %b = load i32, ptr addrspace(1) %b.gep, align 4
   %cmp = icmp sle i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out.gep, align 4
+  store i32 %val, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -26,10 +26,10 @@ define amdgpu_kernel void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrs
 ; GCN: s_min_i32
 
 ; EG: MIN_INT
-define amdgpu_kernel void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
   %cmp = icmp sle i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -37,10 +37,10 @@ define amdgpu_kernel void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i
 ; GCN: s_min_i32
 
 ; EG: MIN_INT
-define amdgpu_kernel void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
   %cmp = icmp sle <1 x i32> %a, %b
   %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
-  store <1 x i32> %val, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -54,10 +54,10 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <
 ; EG: MIN_INT
 ; EG: MIN_INT
 ; EG: MIN_INT
-define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) #0 {
   %cmp = icmp sle <4 x i32> %a, %b
   %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
-  store <4 x i32> %val, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -67,10 +67,10 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <
 ; GCN: s_sext_i32_i8
 ; GCN: s_sext_i32_i8
 ; GCN: s_min_i32
-define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
   %cmp = icmp sle i8 %a, %b
   %val = select i1 %cmp, i8 %a, i8 %b
-  store i8 %val, i8 addrspace(1)* %out
+  store i8 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -100,10 +100,10 @@ define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32],
 ; EG: MIN_INT
 ; EG: MIN_INT
 ; EG: MIN_INT
-define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
   %cmp = icmp sle <4 x i8> %a, %b
   %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
-  store <4 x i8> %val, <4 x i8> addrspace(1)* %out
+  store <4 x i8> %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -126,10 +126,10 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8
 
 ; EG: MIN_INT
 ; EG: MIN_INT
-define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
   %cmp = icmp sle <2 x i16> %a, %b
   %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
-  store <2 x i16> %val, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -152,10 +152,10 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <
 ; EG: MIN_INT
 ; EG: MIN_INT
 ; EG: MIN_INT
-define amdgpu_kernel void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16> %a, <4 x i16> %b) #0 {
   %cmp = icmp sle <4 x i16> %a, %b
   %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
-  store <4 x i16> %val, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -163,16 +163,16 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <
 ; GCN: v_min_i32_e32
 
 ; EG: MIN_INT
-define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %aptr, i32 %tid
-  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %a.gep, align 4
-  %b = load i32, i32 addrspace(1)* %b.gep, align 4
+  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
+  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %a.gep, align 4
+  %b = load i32, ptr addrspace(1) %b.gep, align 4
   %cmp = icmp slt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out.gep, align 4
+  store i32 %val, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -183,17 +183,17 @@ define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrs
 ; GFX10:  v_min_i16
 
 ; EG: MIN_INT
-define amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %aptr, i32 %tid
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %bptr, i32 %tid
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %aptr, i32 %tid
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %bptr, i32 %tid
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
 
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %cmp = icmp slt i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
-  store i16 %val, i16 addrspace(1)* %out.gep
+  store i16 %val, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -201,10 +201,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrs
 ; GCN: s_min_i32
 
 ; EG: MIN_INT
-define amdgpu_kernel void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
   %cmp = icmp slt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -214,10 +214,10 @@ define amdgpu_kernel void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i
 
 ; EG: MIN_INT
 ; EG: MIN_INT
-define amdgpu_kernel void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %cmp = icmp slt <2 x i32> %a, %b
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
-  store <2 x i32> %val, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -225,10 +225,10 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <
 ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
 
 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
-define amdgpu_kernel void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
+define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a) #0 {
   %cmp = icmp slt i32 %a, 8
   %val = select i1 %cmp, i32 %a, i32 8
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -236,10 +236,10 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %
 ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
 
 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
-define amdgpu_kernel void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
+define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a) #0 {
   %cmp = icmp sle i32 %a, 8
   %val = select i1 %cmp, i32 %a, i32 8
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -247,16 +247,16 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %
 ; GCN: v_min_u32_e32
 
 ; EG: MIN_UINT
-define amdgpu_kernel void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
-  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %a.gep, align 4
-  %b = load i32, i32 addrspace(1)* %b.gep, align 4
+  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %a.gep, align 4
+  %b = load i32, ptr addrspace(1) %b.gep, align 4
   %cmp = icmp ule i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out.gep, align 4
+  store i32 %val, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -270,17 +270,17 @@ define amdgpu_kernel void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrs
 ; EG: MIN_UINT
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %a.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a.ptr, i32 %tid
-  %b.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b.ptr, i32 %tid
-  %out.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid
+  %a.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %out, i32 %tid
 
-  %a = load <3 x i32>, <3 x i32> addrspace(1)* %a.gep
-  %b = load <3 x i32>, <3 x i32> addrspace(1)* %b.gep
+  %a = load <3 x i32>, ptr addrspace(1) %a.gep
+  %b = load <3 x i32>, ptr addrspace(1) %b.gep
   %cmp = icmp ule <3 x i32> %a, %b
   %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
-  store <3 x i32> %val, <3 x i32> addrspace(1)* %out.gep
+  store <3 x i32> %val, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -304,17 +304,17 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <
 ; EG: MIN_UINT
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define amdgpu_kernel void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %a.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %a.ptr, i32 %tid
-  %b.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %b.ptr, i32 %tid
-  %out.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid
+  %a.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %out, i32 %tid
 
-  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.gep
-  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.gep
+  %a = load <3 x i16>, ptr addrspace(1) %a.gep
+  %b = load <3 x i16>, ptr addrspace(1) %b.gep
   %cmp = icmp ule <3 x i16> %a, %b
   %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  store <3 x i16> %val, <3 x i16> addrspace(1)* %out.gep
+  store <3 x i16> %val, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -322,10 +322,10 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <
 ; GCN: s_min_u32
 
 ; EG: MIN_UINT
-define amdgpu_kernel void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
   %cmp = icmp ule i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -333,16 +333,16 @@ define amdgpu_kernel void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i
 ; GCN: v_min_u32_e32
 
 ; EG: MIN_UINT
-define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
-  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %a.gep, align 4
-  %b = load i32, i32 addrspace(1)* %b.gep, align 4
+  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %a.gep, align 4
+  %b = load i32, ptr addrspace(1) %b.gep, align 4
   %cmp = icmp ult i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out.gep, align 4
+  store i32 %val, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
@@ -357,17 +357,17 @@ define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrs
 ; GFX10:     v_min_u16
 
 ; EG: MIN_UINT
-define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %a.gep = getelementptr inbounds i8, i8 addrspace(1)* %a.ptr, i32 %tid
-  %b.gep = getelementptr inbounds i8, i8 addrspace(1)* %b.ptr, i32 %tid
-  %out.gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i32 %tid
+  %a.gep = getelementptr inbounds i8, ptr addrspace(1) %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds i8, ptr addrspace(1) %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds i8, ptr addrspace(1) %out, i32 %tid
 
-  %a = load i8, i8 addrspace(1)* %a.gep, align 1
-  %b = load i8, i8 addrspace(1)* %b.gep, align 1
+  %a = load i8, ptr addrspace(1) %a.gep, align 1
+  %b = load i8, ptr addrspace(1) %b.gep, align 1
   %cmp = icmp ult i8 %a, %b
   %val = select i1 %cmp, i8 %a, i8 %b
-  store i8 %val, i8 addrspace(1)* %out.gep, align 1
+  store i8 %val, ptr addrspace(1) %out.gep, align 1
   ret void
 }
 
@@ -375,10 +375,10 @@ define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspac
 ; GCN: s_min_u32
 
 ; EG: MIN_UINT
-define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
   %cmp = icmp ult i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -391,13 +391,13 @@ define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i
 ; GCN: s_endpgm
 
 ; EG-NOT: MIN_UINT
-define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
+  %a = load i32, ptr addrspace(1) %aptr, align 4
+  %b = load i32, ptr addrspace(1) %bptr, align 4
   %cmp = icmp ult i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out0, align 4
-  store i1 %cmp, i1 addrspace(1)* %out1
+  store i32 %val, ptr addrspace(1) %out0, align 4
+  store i1 %cmp, ptr addrspace(1) %out1
   ret void
 }
 
@@ -409,13 +409,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0
 ; GCN: s_endpgm
 
 ; EG-NOT: MIN_UINT
-define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
-  %a = load i16, i16 addrspace(1)* %aptr, align 2
-  %b = load i16, i16 addrspace(1)* %bptr, align 2
+define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
+  %a = load i16, ptr addrspace(1) %aptr, align 2
+  %b = load i16, ptr addrspace(1) %bptr, align 2
   %cmp = icmp ult i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
-  store i16 %val, i16 addrspace(1)* %out0, align 2
-  store i1 %cmp, i1 addrspace(1)* %out1
+  store i16 %val, ptr addrspace(1) %out0, align 2
+  store i1 %cmp, ptr addrspace(1) %out1
   ret void
 }
 
@@ -424,10 +424,10 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0
 ; GCN: s_min_u32
 
 ; EG: MIN_UINT
-define amdgpu_kernel void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
+define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
   %cmp = icmp ult <1 x i32> %a, %b
   %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
-  store <1 x i32> %val, <1 x i32> addrspace(1)* %out
+  store <1 x i32> %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -449,10 +449,10 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <
 ; EG: MIN_UINT
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 {
+define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) #0 {
   %cmp = icmp ult <8 x i32> %a, %b
   %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
-  store <8 x i32> %val, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -484,10 +484,10 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <
 ; EG: MIN_UINT
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 {
+define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16> %a, <8 x i16> %b) #0 {
   %cmp = icmp ult <8 x i16> %a, %b
   %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
-  store <8 x i16> %val, <8 x i16> addrspace(1)* %out
+  store <8 x i16> %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -500,13 +500,13 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <
 ; GCN: {{flat|global}}_store_{{dword|b32}} v{{.+}}, [[VMIN]]
 
 ; EG: MIN_UINT
-define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
+define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
   %a.ext = zext i16 %a to i32
   %b.ext = zext i16 %b to i32
   %cmp = icmp ult i32 %a.ext, %b.ext
   %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
   %mask = and i32 %val, 65535
-  store i32 %mask, i32 addrspace(1)* %out
+  store i32 %mask, ptr addrspace(1) %out
   ret void
 }
 
@@ -523,14 +523,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspac
 ; GCN: {{flat|global}}_store_{{dword|b32}} v{{.+}}, [[VMIN]]
 
 ; EG: MIN_INT
-define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 {
+define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 {
   %a.ext = sext i16 %a to i32
   %b.ext = sext i16 %b to i32
   %cmp = icmp slt i32 %a.ext, %b.ext
   %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
   %shl = shl i32 %val, 16
   %sextinreg = ashr i32 %shl, 16
-  store i32 %sextinreg, i32 addrspace(1)* %out
+  store i32 %sextinreg, ptr addrspace(1) %out
   ret void
 }
 
@@ -538,10 +538,10 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace
 ; GCN: s_min_i32
 
 ; EG: MIN_INT
-define amdgpu_kernel void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i16 %b) #0 {
   %cmp = icmp sle i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
-  store i16 %val, i16 addrspace(1)* %out
+  store i16 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -551,10 +551,10 @@ define amdgpu_kernel void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i
 
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define amdgpu_kernel void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
   %tmp = icmp ult i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
-  store i64 %val, i64 addrspace(1)* %out, align 8
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -563,10 +563,10 @@ define amdgpu_kernel void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64
 
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define amdgpu_kernel void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
   %tmp = icmp ule i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
-  store i64 %val, i64 addrspace(1)* %out, align 8
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -575,10 +575,10 @@ define amdgpu_kernel void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64
 
 ; EG-DAG: MIN_UINT
 ; EG-DAG: MIN_INT
-define amdgpu_kernel void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
   %tmp = icmp slt i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
-  store i64 %val, i64 addrspace(1)* %out, align 8
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -587,10 +587,10 @@ define amdgpu_kernel void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64
 
 ; EG-DAG: MIN_UINT
 ; EG-DAG: MIN_INT
-define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
   %tmp = icmp sle i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
-  store i64 %val, i64 addrspace(1)* %out, align 8
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -605,16 +605,16 @@ define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64
 
 ; EG: MIN_INT
 ; EG: MIN_INT
-define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
-  %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
-  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %a = load <2 x i16>, ptr addrspace(1) %a.gep
+  %b = load <2 x i16>, ptr addrspace(1) %b.gep
   %cmp = icmp sle <2 x i16> %a, %b
   %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
-  store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep
+  store <2 x i16> %val, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -630,16 +630,16 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <
 
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define amdgpu_kernel void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
-  %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
-  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %a = load <2 x i16>, ptr addrspace(1) %a.gep
+  %b = load <2 x i16>, ptr addrspace(1) %b.gep
   %cmp = icmp ule <2 x i16> %a, %b
   %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
-  store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep
+  store <2 x i16> %val, ptr addrspace(1) %out.gep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/min3.ll b/llvm/test/CodeGen/AMDGPU/min3.ll
index 1e82aca0099d0..11e4f738d6d24 100644
--- a/llvm/test/CodeGen/AMDGPU/min3.ll
+++ b/llvm/test/CodeGen/AMDGPU/min3.ll
@@ -4,63 +4,63 @@
 
 ; GCN-LABEL: {{^}}v_test_imin3_slt_i32:
 ; GCN: v_min3_i32
-define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_imin3_slt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0
-  %b = load i32, i32 addrspace(1)* %gep1
-  %c = load i32, i32 addrspace(1)* %gep2
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i32, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep0
+  %b = load i32, ptr addrspace(1) %gep1
+  %c = load i32, ptr addrspace(1) %gep2
   %icmp0 = icmp slt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
   %icmp1 = icmp slt i32 %i0, %c
   %i1 = select i1 %icmp1, i32 %i0, i32 %c
-  store i32 %i1, i32 addrspace(1)* %outgep
+  store i32 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umin3_ult_i32:
 ; GCN: v_min3_u32
-define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umin3_ult_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0
-  %b = load i32, i32 addrspace(1)* %gep1
-  %c = load i32, i32 addrspace(1)* %gep2
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i32, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep0
+  %b = load i32, ptr addrspace(1) %gep1
+  %c = load i32, ptr addrspace(1) %gep2
   %icmp0 = icmp ult i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
   %icmp1 = icmp ult i32 %i0, %c
   %i1 = select i1 %icmp1, i32 %i0, i32 %c
-  store i32 %i1, i32 addrspace(1)* %outgep
+  store i32 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umin_umin_umin:
 ; GCN: v_min_i32
 ; GCN: v_min3_i32
-define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umin_umin_umin(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid2 = mul i32 %tid, 2
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i32, ptr addrspace(1) %cptr, i32 %tid
 
-  %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2
-  %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2
-  %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2
+  %gep3 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid2
+  %gep4 = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid2
+  %gep5 = getelementptr i32, ptr addrspace(1) %cptr, i32 %tid2
 
-  %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
+  %outgep0 = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %outgep1 = getelementptr i32, ptr addrspace(1) %out, i32 %tid2
 
-  %a = load i32, i32 addrspace(1)* %gep0
-  %b = load i32, i32 addrspace(1)* %gep1
-  %c = load i32, i32 addrspace(1)* %gep2
-  %d = load i32, i32 addrspace(1)* %gep3
+  %a = load i32, ptr addrspace(1) %gep0
+  %b = load i32, ptr addrspace(1) %gep1
+  %c = load i32, ptr addrspace(1) %gep2
+  %d = load i32, ptr addrspace(1) %gep3
 
   %icmp0 = icmp slt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
@@ -71,30 +71,30 @@ define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 add
   %icmp2 = icmp slt i32 %i0, %i1
   %i2 = select i1 %icmp2, i32 %i0, i32 %i1
 
-  store i32 %i2, i32 addrspace(1)* %outgep1
+  store i32 %i2, ptr addrspace(1) %outgep1
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umin3_2_uses:
 ; GCN-NOT: v_min3
-define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umin3_2_uses(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid2 = mul i32 %tid, 2
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i32, ptr addrspace(1) %cptr, i32 %tid
 
-  %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2
-  %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2
-  %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2
+  %gep3 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid2
+  %gep4 = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid2
+  %gep5 = getelementptr i32, ptr addrspace(1) %cptr, i32 %tid2
 
-  %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
+  %outgep0 = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %outgep1 = getelementptr i32, ptr addrspace(1) %out, i32 %tid2
 
-  %a = load i32, i32 addrspace(1)* %gep0
-  %b = load i32, i32 addrspace(1)* %gep1
-  %c = load i32, i32 addrspace(1)* %gep2
-  %d = load i32, i32 addrspace(1)* %gep3
+  %a = load i32, ptr addrspace(1) %gep0
+  %b = load i32, ptr addrspace(1) %gep1
+  %c = load i32, ptr addrspace(1) %gep2
+  %d = load i32, ptr addrspace(1) %gep3
 
   %icmp0 = icmp slt i32 %a, %b
   %i0 = select i1 %icmp0, i32 %a, i32 %b
@@ -105,8 +105,8 @@ define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrs
   %icmp2 = icmp slt i32 %i0, %c
   %i2 = select i1 %icmp2, i32 %i0, i32 %c
 
-  store i32 %i2, i32 addrspace(1)* %outgep0
-  store i32 %i0, i32 addrspace(1)* %outgep1
+  store i32 %i2, ptr addrspace(1) %outgep0
+  store i32 %i0, ptr addrspace(1) %outgep1
   ret void
 }
 
@@ -117,20 +117,20 @@ define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrs
 ; VI: v_min_i16
 
 ; GFX9: v_min3_i16
-define amdgpu_kernel void @v_test_imin3_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_imin3_slt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep0
-  %b = load i16, i16 addrspace(1)* %gep1
-  %c = load i16, i16 addrspace(1)* %gep2
+  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i16, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep0
+  %b = load i16, ptr addrspace(1) %gep1
+  %c = load i16, ptr addrspace(1) %gep2
   %icmp0 = icmp slt i16 %a, %b
   %i0 = select i1 %icmp0, i16 %a, i16 %b
   %icmp1 = icmp slt i16 %i0, %c
   %i1 = select i1 %icmp1, i16 %i0, i16 %c
-  store i16 %i1, i16 addrspace(1)* %outgep
+  store i16 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -141,20 +141,20 @@ define amdgpu_kernel void @v_test_imin3_slt_i16(i16 addrspace(1)* %out, i16 addr
 ; VI: v_min_u16
 
 ; GFX9: v_min3_u16
-define amdgpu_kernel void @v_test_umin3_ult_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umin3_ult_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep0
-  %b = load i16, i16 addrspace(1)* %gep1
-  %c = load i16, i16 addrspace(1)* %gep2
+  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i16, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep0
+  %b = load i16, ptr addrspace(1) %gep1
+  %c = load i16, ptr addrspace(1) %gep2
   %icmp0 = icmp ult i16 %a, %b
   %i0 = select i1 %icmp0, i16 %a, i16 %b
   %icmp1 = icmp ult i16 %i0, %c
   %i1 = select i1 %icmp1, i16 %i0, i16 %c
-  store i16 %i1, i16 addrspace(1)* %outgep
+  store i16 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -165,20 +165,20 @@ define amdgpu_kernel void @v_test_umin3_ult_i16(i16 addrspace(1)* %out, i16 addr
 ; VI: v_min_i16
 
 ; GFX9: v_min3_i16
-define amdgpu_kernel void @v_test_imin3_slt_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr, i8 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_imin3_slt_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i8, i8 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
-  %a = load i8, i8 addrspace(1)* %gep0
-  %b = load i8, i8 addrspace(1)* %gep1
-  %c = load i8, i8 addrspace(1)* %gep2
+  %gep0 = getelementptr i8, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i8, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i8, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i8, ptr addrspace(1) %out, i32 %tid
+  %a = load i8, ptr addrspace(1) %gep0
+  %b = load i8, ptr addrspace(1) %gep1
+  %c = load i8, ptr addrspace(1) %gep2
   %icmp0 = icmp slt i8 %a, %b
   %i0 = select i1 %icmp0, i8 %a, i8 %b
   %icmp1 = icmp slt i8 %i0, %c
   %i1 = select i1 %icmp1, i8 %i0, i8 %c
-  store i8 %i1, i8 addrspace(1)* %outgep
+  store i8 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -189,20 +189,20 @@ define amdgpu_kernel void @v_test_imin3_slt_i8(i8 addrspace(1)* %out, i8 addrspa
 ; VI: v_min_u16
 
 ; GFX9: v_min3_u16
-define amdgpu_kernel void @v_test_umin3_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr, i8 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umin3_ult_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i8, i8 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
-  %a = load i8, i8 addrspace(1)* %gep0
-  %b = load i8, i8 addrspace(1)* %gep1
-  %c = load i8, i8 addrspace(1)* %gep2
+  %gep0 = getelementptr i8, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i8, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i8, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i8, ptr addrspace(1) %out, i32 %tid
+  %a = load i8, ptr addrspace(1) %gep0
+  %b = load i8, ptr addrspace(1) %gep1
+  %c = load i8, ptr addrspace(1) %gep2
   %icmp0 = icmp ult i8 %a, %b
   %i0 = select i1 %icmp0, i8 %a, i8 %b
   %icmp1 = icmp ult i8 %i0, %c
   %i1 = select i1 %icmp1, i8 %i0, i8 %c
-  store i8 %i1, i8 addrspace(1)* %outgep
+  store i8 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -213,20 +213,20 @@ define amdgpu_kernel void @v_test_umin3_ult_i8(i8 addrspace(1)* %out, i8 addrspa
 ; VI: v_min_i16
 
 ; GFX9: v_min3_i16
-define amdgpu_kernel void @v_test_imin3_slt_i7(i7 addrspace(1)* %out, i7 addrspace(1)* %aptr, i7 addrspace(1)* %bptr, i7 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_imin3_slt_i7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i7, i7 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i7, i7 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i7, i7 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i7, i7 addrspace(1)* %out, i32 %tid
-  %a = load i7, i7 addrspace(1)* %gep0
-  %b = load i7, i7 addrspace(1)* %gep1
-  %c = load i7, i7 addrspace(1)* %gep2
+  %gep0 = getelementptr i7, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i7, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i7, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i7, ptr addrspace(1) %out, i32 %tid
+  %a = load i7, ptr addrspace(1) %gep0
+  %b = load i7, ptr addrspace(1) %gep1
+  %c = load i7, ptr addrspace(1) %gep2
   %icmp0 = icmp slt i7 %a, %b
   %i0 = select i1 %icmp0, i7 %a, i7 %b
   %icmp1 = icmp slt i7 %i0, %c
   %i1 = select i1 %icmp1, i7 %i0, i7 %c
-  store i7 %i1, i7 addrspace(1)* %outgep
+  store i7 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -237,96 +237,96 @@ define amdgpu_kernel void @v_test_imin3_slt_i7(i7 addrspace(1)* %out, i7 addrspa
 ; VI: v_min_u16
 
 ; GFX9: v_min3_u16
-define amdgpu_kernel void @v_test_umin3_ult_i7(i7 addrspace(1)* %out, i7 addrspace(1)* %aptr, i7 addrspace(1)* %bptr, i7 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umin3_ult_i7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i7, i7 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i7, i7 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i7, i7 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i7, i7 addrspace(1)* %out, i32 %tid
-  %a = load i7, i7 addrspace(1)* %gep0
-  %b = load i7, i7 addrspace(1)* %gep1
-  %c = load i7, i7 addrspace(1)* %gep2
+  %gep0 = getelementptr i7, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i7, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i7, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i7, ptr addrspace(1) %out, i32 %tid
+  %a = load i7, ptr addrspace(1) %gep0
+  %b = load i7, ptr addrspace(1) %gep1
+  %c = load i7, ptr addrspace(1) %gep2
   %icmp0 = icmp ult i7 %a, %b
   %i0 = select i1 %icmp0, i7 %a, i7 %b
   %icmp1 = icmp ult i7 %i0, %c
   %i1 = select i1 %icmp1, i7 %i0, i7 %c
-  store i7 %i1, i7 addrspace(1)* %outgep
+  store i7 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_imin3_slt_i33:
 ; GCN-NOT: v_min3
-define amdgpu_kernel void @v_test_imin3_slt_i33(i33 addrspace(1)* %out, i33 addrspace(1)* %aptr, i33 addrspace(1)* %bptr, i33 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_imin3_slt_i33(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i33, i33 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i33, i33 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i33, i33 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i33, i33 addrspace(1)* %out, i32 %tid
-  %a = load i33, i33 addrspace(1)* %gep0
-  %b = load i33, i33 addrspace(1)* %gep1
-  %c = load i33, i33 addrspace(1)* %gep2
+  %gep0 = getelementptr i33, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i33, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i33, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i33, ptr addrspace(1) %out, i32 %tid
+  %a = load i33, ptr addrspace(1) %gep0
+  %b = load i33, ptr addrspace(1) %gep1
+  %c = load i33, ptr addrspace(1) %gep2
   %icmp0 = icmp slt i33 %a, %b
   %i0 = select i1 %icmp0, i33 %a, i33 %b
   %icmp1 = icmp slt i33 %i0, %c
   %i1 = select i1 %icmp1, i33 %i0, i33 %c
-  store i33 %i1, i33 addrspace(1)* %outgep
+  store i33 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umin3_ult_i33:
 ; GCN-NOT: v_min3
-define amdgpu_kernel void @v_test_umin3_ult_i33(i33 addrspace(1)* %out, i33 addrspace(1)* %aptr, i33 addrspace(1)* %bptr, i33 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umin3_ult_i33(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i33, i33 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i33, i33 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i33, i33 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i33, i33 addrspace(1)* %out, i32 %tid
-  %a = load i33, i33 addrspace(1)* %gep0
-  %b = load i33, i33 addrspace(1)* %gep1
-  %c = load i33, i33 addrspace(1)* %gep2
+  %gep0 = getelementptr i33, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i33, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i33, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i33, ptr addrspace(1) %out, i32 %tid
+  %a = load i33, ptr addrspace(1) %gep0
+  %b = load i33, ptr addrspace(1) %gep1
+  %c = load i33, ptr addrspace(1) %gep2
   %icmp0 = icmp ult i33 %a, %b
   %i0 = select i1 %icmp0, i33 %a, i33 %b
   %icmp1 = icmp ult i33 %i0, %c
   %i1 = select i1 %icmp1, i33 %i0, i33 %c
-  store i33 %i1, i33 addrspace(1)* %outgep
+  store i33 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_imin3_slt_i64:
 ; GCN-NOT: v_min3
-define amdgpu_kernel void @v_test_imin3_slt_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_imin3_slt_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i64, i64 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
-  %a = load i64, i64 addrspace(1)* %gep0
-  %b = load i64, i64 addrspace(1)* %gep1
-  %c = load i64, i64 addrspace(1)* %gep2
+  %gep0 = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i64, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i64, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
+  %a = load i64, ptr addrspace(1) %gep0
+  %b = load i64, ptr addrspace(1) %gep1
+  %c = load i64, ptr addrspace(1) %gep2
   %icmp0 = icmp slt i64 %a, %b
   %i0 = select i1 %icmp0, i64 %a, i64 %b
   %icmp1 = icmp slt i64 %i0, %c
   %i1 = select i1 %icmp1, i64 %i0, i64 %c
-  store i64 %i1, i64 addrspace(1)* %outgep
+  store i64 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umin3_ult_i64:
 ; GCN-NOT: v_min3
-define amdgpu_kernel void @v_test_umin3_ult_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 addrspace(1)* %cptr) #0 {
+define amdgpu_kernel void @v_test_umin3_ult_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
-  %gep2 = getelementptr i64, i64 addrspace(1)* %cptr, i32 %tid
-  %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
-  %a = load i64, i64 addrspace(1)* %gep0
-  %b = load i64, i64 addrspace(1)* %gep1
-  %c = load i64, i64 addrspace(1)* %gep2
+  %gep0 = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr i64, ptr addrspace(1) %bptr, i32 %tid
+  %gep2 = getelementptr i64, ptr addrspace(1) %cptr, i32 %tid
+  %outgep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
+  %a = load i64, ptr addrspace(1) %gep0
+  %b = load i64, ptr addrspace(1) %gep1
+  %c = load i64, ptr addrspace(1) %gep2
   %icmp0 = icmp ult i64 %a, %b
   %i0 = select i1 %icmp0, i64 %a, i64 %b
   %icmp1 = icmp ult i64 %i0, %c
   %i1 = select i1 %icmp1, i64 %i0, i64 %c
-  store i64 %i1, i64 addrspace(1)* %outgep
+  store i64 %i1, ptr addrspace(1) %outgep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 7f82f5d0a8079..a9f370f491dfd 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -14,7 +14,7 @@ define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
   ret i32 %sminmax
 }
 
-define amdgpu_ps void @s_test_minmax_i32(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 addrspace(1)* inreg %out) {
+define amdgpu_ps void @s_test_minmax_i32(i32 inreg %a, i32 inreg %b, i32 inreg %c, ptr addrspace(1) inreg %out) {
 ; SDAG-LABEL: s_test_minmax_i32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_max_i32 s0, s0, s1
@@ -38,7 +38,7 @@ define amdgpu_ps void @s_test_minmax_i32(i32 inreg %a, i32 inreg %b, i32 inreg %
 ; GISEL-NEXT:    s_endpgm
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
-  store i32 %sminmax, i32 addrspace(1)* %out
+  store i32 %sminmax, ptr addrspace(1) %out
   ret void
 }
 
@@ -78,7 +78,7 @@ define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) {
   ret i32 %smaxmin
 }
 
-define void @test_smed3_i32(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) {
+define void @test_smed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 ; GFX11-LABEL: test_smed3_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -91,7 +91,7 @@ define void @test_smed3_i32(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) {
   %tmp1 = call i32 @llvm.smax.i32(i32 %x, i32 %y)
   %tmp2 = call i32 @llvm.smin.i32(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @llvm.smax.i32(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -107,7 +107,7 @@ define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) {
   ret i32 %uminmax
 }
 
-define amdgpu_ps void @s_test_minmax_u32(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 addrspace(1)* inreg %out) {
+define amdgpu_ps void @s_test_minmax_u32(i32 inreg %a, i32 inreg %b, i32 inreg %c, ptr addrspace(1) inreg %out) {
 ; SDAG-LABEL: s_test_minmax_u32:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_max_u32 s0, s0, s1
@@ -131,7 +131,7 @@ define amdgpu_ps void @s_test_minmax_u32(i32 inreg %a, i32 inreg %b, i32 inreg %
 ; GISEL-NEXT:    s_endpgm
   %smax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.umin.i32(i32 %smax, i32 %c)
-  store i32 %sminmax, i32 addrspace(1)* %out
+  store i32 %sminmax, ptr addrspace(1) %out
   ret void
 }
 
@@ -171,7 +171,7 @@ define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) {
   ret i32 %umaxmin
 }
 
-define void @test_umed3_i32(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) {
+define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 ; GFX11-LABEL: test_umed3_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -184,7 +184,7 @@ define void @test_umed3_i32(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) {
   %tmp1 = call i32 @llvm.umax.i32(i32 %x, i32 %y)
   %tmp2 = call i32 @llvm.umin.i32(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @llvm.umax.i32(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -211,7 +211,7 @@ define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) {
   ret float %minmax
 }
 
-define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg %b, float inreg %c, float addrspace(1)* inreg %out) {
+define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg %b, float inreg %c, ptr addrspace(1) inreg %out) {
 ; SDAG-LABEL: s_test_minmax_f32_ieee_false:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
@@ -233,7 +233,7 @@ define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg
 ; GISEL-NEXT:    s_endpgm
   %smax = call float @llvm.maxnum.f32(float %a, float %b)
   %sminmax = call float @llvm.minnum.f32(float %smax, float %c)
-  store float %sminmax, float addrspace(1)* %out
+  store float %sminmax, ptr addrspace(1) %out
   ret void
 }
 
@@ -280,7 +280,7 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b,
   ret float %maxmin
 }
 
-define void @test_med3_f32(float addrspace(1)* %arg, float %x, float %y, float %z) #0 {
+define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z) #0 {
 ; GFX11-LABEL: test_med3_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -293,7 +293,7 @@ define void @test_med3_f32(float addrspace(1)* %arg, float %x, float %y, float %
   %tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
   %tmp3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
-  store float %tmp3, float addrspace(1)* %arg
+  store float %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -307,7 +307,7 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
   ret half %minmax
 }
 
-define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, half addrspace(1)* inreg %out) {
+define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) {
 ; SDAG-LABEL: s_test_minmax_f16_ieee_false:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
@@ -329,7 +329,7 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 ; GISEL-NEXT:    s_endpgm
   %smax = call half @llvm.maxnum.f16(half %a, half %b)
   %sminmax = call half @llvm.minnum.f16(half %smax, half %c)
-  store half %sminmax, half addrspace(1)* %out
+  store half %sminmax, ptr addrspace(1) %out
   ret void
 }
 
@@ -393,7 +393,7 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
   ret half %maxmin
 }
 
-define void @test_med3_f16(half addrspace(1)* %arg, half %x, half %y, half %z) #0 {
+define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0 {
 ; GFX11-LABEL: test_med3_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -406,7 +406,7 @@ define void @test_med3_f16(half addrspace(1)* %arg, half %x, half %y, half %z) #
   %tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
   %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
   %tmp3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2)
-  store half %tmp3, half addrspace(1)* %arg
+  store half %tmp3, ptr addrspace(1) %arg
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mmo-target-flags-folding.ll b/llvm/test/CodeGen/AMDGPU/mmo-target-flags-folding.ll
index 3746810ac942e..0810a19bc8269 100644
--- a/llvm/test/CodeGen/AMDGPU/mmo-target-flags-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/mmo-target-flags-folding.ll
@@ -5,19 +5,17 @@
 
 ; GCN-LABEL: {{^}}test_load_folding_mmo_flags:
 ; GCN: global_load_dwordx2
-define amdgpu_kernel void @test_load_folding_mmo_flags(<2 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_load_folding_mmo_flags(ptr addrspace(1) %arg) {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %arrayidx = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
-  %i1 = bitcast <2 x float> addrspace(1)* %arrayidx to i64 addrspace(1)*
-  %i2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %arrayidx, i64 0, i32 0
-  %i3 = load float, float addrspace(1)* %i2, align 4
-  %idx = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arrayidx, i64 0, i32 1
-  %i4 = load float, float addrspace(1)* %idx, align 4
-  %i5 = load i64, i64 addrspace(1)* %i1, align 4, !amdgpu.noclobber !0
-  store i64 %i5, i64 addrspace(1)* undef, align 4
+  %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
+  %i3 = load float, ptr addrspace(1) %arrayidx, align 4
+  %idx = getelementptr inbounds <2 x float>, ptr addrspace(1) %arrayidx, i64 0, i32 1
+  %i4 = load float, ptr addrspace(1) %idx, align 4
+  %i5 = load i64, ptr addrspace(1) %arrayidx, align 4, !amdgpu.noclobber !0
+  store i64 %i5, ptr addrspace(1) undef, align 4
   %mul = fmul float %i3, %i4
-  store float %mul, float addrspace(1)* undef, align 4
+  store float %mul, ptr addrspace(1) undef, align 4
   unreachable
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
index da7b926aec5f2..a73a153f4679c 100644
--- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
+++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
@@ -19,7 +19,7 @@ define amdgpu_kernel void @nocall_ideal() {
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    ds_write_b32 v0, v0
 ; CHECK-NEXT:    s_endpgm
-store i32 0, i32 addrspace(3)* @used_by_kernel
+store i32 0, ptr addrspace(3) @used_by_kernel
   ret void
 }
 ; CHECK: ; LDSByteSize: 4 bytes
@@ -115,7 +115,7 @@ define amdgpu_kernel void @withcall() {
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; G_GFX10-NEXT:    s_endpgm
-  store i32 0, i32 addrspace(3)* @used_by_both
+  store i32 0, ptr addrspace(3) @used_by_both
   call void @nonkernel()
   ret void
 }
@@ -130,7 +130,7 @@ define amdgpu_kernel void @nocall_false_sharing() {
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    ds_write_b32 v0, v0
 ; CHECK-NEXT:    s_endpgm
-  store i32 0, i32 addrspace(3)* @used_by_both
+  store i32 0, ptr addrspace(3) @used_by_both
   ret void
 }
 ; CHECK: ; LDSByteSize: 4 bytes
@@ -182,7 +182,7 @@ define void @nonkernel() {
 ; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
 ; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store i32 0, i32 addrspace(3)* @used_by_both
-  store double 0.0, double addrspace(3)* @used_by_function
+  store i32 0, ptr addrspace(3) @used_by_both
+  store double 0.0, ptr addrspace(3) @used_by_function
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/mul.i16.ll b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
index 6cdff0047d13f..0f7eb012e81a9 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
@@ -17,7 +17,7 @@ define i16 @v_mul_i16(i16 %a, i16 %b) {
 ; GCN: s_mul_i16
 define amdgpu_kernel void @s_mul_i16(i16 %a, i16 %b) {
   %r.val = mul i16 %a, %b
-  store volatile i16 %r.val, i16 addrspace(1)* null
+  store volatile i16 %r.val, ptr addrspace(1) null
   ret void
 }
 
@@ -25,14 +25,14 @@ define amdgpu_kernel void @s_mul_i16(i16 %a, i16 %b) {
 ; GCN-LABEL: {{^}}v_mul_i16_uniform_load:
 ; GCN: v_mul_lo_u32
 define amdgpu_kernel void @v_mul_i16_uniform_load(
-    i16 addrspace(1)* %r,
-    i16 addrspace(1)* %a,
-    i16 addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load i16, i16 addrspace(1)* %a
-  %b.val = load i16, i16 addrspace(1)* %b
+  %a.val = load i16, ptr addrspace(1) %a
+  %b.val = load i16, ptr addrspace(1) %b
   %r.val = mul i16 %a.val, %b.val
-  store i16 %r.val, i16 addrspace(1)* %r
+  store i16 %r.val, ptr addrspace(1) %r
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 5678e90dc3d95..85dd59a0c4217 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -14,12 +14,12 @@
 ; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
+define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <2 x i32>, ptr addrspace(1) %in
+  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
   %result = mul <2 x i32> %a, %b
-  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -34,12 +34,12 @@ define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32
 ; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <4 x i32>, ptr addrspace(1) %in
+  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
   %result = mul <4 x i32> %a, %b
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -48,10 +48,10 @@ define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> a
 ; GCN: s_load_dword
 ; GCN: s_mul_i32
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) {
   %mul = mul i64 %b, %a
   %trunc = trunc i64 %mul to i32
-  store i32 %trunc, i32 addrspace(1)* %out, align 8
+  store i32 %trunc, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -60,12 +60,12 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a
 ; GCN: s_load_dword
 ; GCN: v_mul_lo_u32
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
-  %a = load i64, i64 addrspace(1)* %aptr, align 8
-  %b = load i64, i64 addrspace(1)* %bptr, align 8
+define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+  %a = load i64, ptr addrspace(1) %aptr, align 8
+  %b = load i64, ptr addrspace(1) %bptr, align 8
   %mul = mul i64 %b, %a
   %trunc = trunc i64 %mul to i32
-  store i32 %trunc, i32 addrspace(1)* %out, align 8
+  store i32 %trunc, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -77,11 +77,11 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 ad
 ; SI-DAG: s_mulk_i32
 ; SI-DAG: v_mul_hi_i32
 ; VI: v_mad_i64_i32
-define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
 entry:
   %0 = sext i32 %in to i64
   %1 = mul i64 %0, 80
-  store i64 %1, i64 addrspace(1)* %out
+  store i64 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -92,11 +92,11 @@ entry:
 ; SI-DAG: v_mul_hi_i32
 ; VI: v_mad_i64_i32
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %val = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %val = load i32, ptr addrspace(1) %in, align 4
   %ext = sext i32 %val to i64
   %mul = mul i64 %ext, 80
-  store i64 %mul, i64 addrspace(1)* %out, align 8
+  store i64 %mul, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -105,11 +105,11 @@ define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(
 ; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
 ; VI: v_mad_i64_i32 v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, 9, 0
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %val = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %val = load i32, ptr addrspace(1) %in, align 4
   %ext = sext i32 %val to i64
   %mul = mul i64 %ext, 9
-  store i64 %mul, i64 addrspace(1)* %out, align 8
+  store i64 %mul, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -120,20 +120,20 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 a
 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; GCN: buffer_store_dword [[VRESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
+define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
   %mul = mul i32 %a, %b
-  store i32 %mul, i32 addrspace(1)* %out, align 4
+  store i32 %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_mul_i32:
 ; GCN: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1)* %in
-  %b = load i32, i32 addrspace(1)* %b_ptr
+define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
   %result = mul i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -150,31 +150,31 @@ define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
 ; GFX9PLUS-DAG: s_mul_i32
 ; GFX9PLUS-DAG: s_mul_i32
 ; GFX9PLUS: s_endpgm
-define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %mul = mul i64 %a, %b
-  store i64 %mul, i64 addrspace(1)* %out, align 8
+  store i64 %mul, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_mul_i64:
 ; GCN: v_mul_lo_u32
-define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
-  %a = load i64, i64 addrspace(1)* %aptr, align 8
-  %b = load i64, i64 addrspace(1)* %bptr, align 8
+define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
+  %a = load i64, ptr addrspace(1) %aptr, align 8
+  %b = load i64, ptr addrspace(1) %bptr, align 8
   %mul = mul i64 %a, %b
-  store i64 %mul, i64 addrspace(1)* %out, align 8
+  store i64 %mul, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}mul32_in_branch:
 ; GCN: s_mul_i32
-define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = icmp eq i32 %a, 0
   br i1 %0, label %if, label %else
 
 if:
-  %1 = load i32, i32 addrspace(1)* %in
+  %1 = load i32, ptr addrspace(1) %in
   br label %endif
 
 else:
@@ -183,7 +183,7 @@ else:
 
 endif:
   %3 = phi i32 [%1, %if], [%2, %else]
-  store i32 %3, i32 addrspace(1)* %out
+  store i32 %3, ptr addrspace(1) %out
   ret void
 }
 
@@ -192,13 +192,13 @@ endif:
 ; SI-DAG: v_mul_hi_u32
 ; VI: v_mad_u64_u32
 ; GCN: s_endpgm
-define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
 
 if:
-  %1 = load i64, i64 addrspace(1)* %in
+  %1 = load i64, ptr addrspace(1) %in
   br label %endif
 
 else:
@@ -207,7 +207,7 @@ else:
 
 endif:
   %3 = phi i64 [%1, %if], [%2, %else]
-  store i64 %3, i64 addrspace(1)* %out
+  store i64 %3, ptr addrspace(1) %out
   ret void
 }
 
@@ -243,9 +243,9 @@ endif:
 
 
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
+define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
   %mul = mul i128 %a, %b
-  store i128 %mul, i128 addrspace(1)* %out
+  store i128 %mul, ptr addrspace(1) %out
   ret void
 }
 
@@ -283,15 +283,15 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, [8 x i32], i128 %
 ; VI-DAG: v_mul_lo_u32
 
 ; GCN: {{buffer|flat}}_store_dwordx4
-define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid
-  %gep.b = getelementptr inbounds i128, i128 addrspace(1)* %bptr, i32 %tid
-  %gep.out = getelementptr inbounds i128, i128 addrspace(1)* %bptr, i32 %tid
-  %a = load i128, i128 addrspace(1)* %gep.a
-  %b = load i128, i128 addrspace(1)* %gep.b
+  %gep.a = getelementptr inbounds i128, ptr addrspace(1) %aptr, i32 %tid
+  %gep.b = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
+  %gep.out = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
+  %a = load i128, ptr addrspace(1) %gep.a
+  %b = load i128, ptr addrspace(1) %gep.b
   %mul = mul i128 %a, %b
-  store i128 %mul, i128 addrspace(1)* %gep.out
+  store i128 %mul, ptr addrspace(1) %gep.out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index f8bf756556585..c3529debe693d 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
 
 ; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
-define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; SI-LABEL: test_smul24_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -93,11 +93,11 @@ entry:
   %b.shl = shl i32 %b, 8
   %b.24 = ashr i32 %b.shl, 8
   %mul24 = mul i32 %a.24, %b.24
-  store i32 %mul24, i32 addrspace(1)* %out
+  store i32 %mul24, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; SI-LABEL: test_smulhi24_i64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -179,7 +179,7 @@ entry:
   %mul48 = mul i64 %a.24.i64, %b.24.i64
   %mul48.hi = lshr i64 %mul48, 32
   %mul24hi = trunc i64 %mul48.hi to i32
-  store i32 %mul24hi, i32 addrspace(1)* %out
+  store i32 %mul24hi, ptr addrspace(1) %out
   ret void
 }
 
@@ -298,7 +298,7 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; unnecessary extension instructions because after legalization they
 ; will not be removed by SimplifyDemandedBits because there are
 ; multiple uses by the separate mul and mulhi.
-define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
 ; SI-LABEL: test_smul24_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -396,11 +396,11 @@ define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i3
   %shr2.i = ashr i32 %shl1.i, 8
   %conv3.i = sext i32 %shr2.i to i64
   %mul.i = mul i64 %conv3.i, %conv.i
-  store i64 %mul.i, i64 addrspace(1)* %out
+  store i64 %mul.i, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; SI-LABEL: test_smul24_i64_square:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -483,11 +483,11 @@ define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a
   %shr.i = ashr i32 %shl.i, 8
   %conv.i = sext i32 %shr.i to i64
   %mul.i = mul i64 %conv.i, %conv.i
-  store i64 %mul.i, i64 addrspace(1)* %out
+  store i64 %mul.i, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
+define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 {
 ; SI-LABEL: test_smul24_i33:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -600,11 +600,11 @@ entry:
   %b.24 = ashr i33 %b.shl, 9
   %mul24 = mul i33 %a.24, %b.24
   %ext = sext i33 %mul24 to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
+define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
 ; SI-LABEL: test_smulhi24_i33:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
@@ -692,11 +692,11 @@ entry:
   %hi = lshr i33 %tmp2, 32
   %trunc = trunc i33 %hi to i32
 
-  store i32 %trunc, i32 addrspace(1)* %out
+  store i32 %trunc, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
+define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
 ; SI-LABEL: simplify_i24_crash:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -833,7 +833,7 @@ bb11:
   %tmp19 = shl <2 x i32> %tmp16, <i32 8, i32 8>
   %tmp20 = ashr <2 x i32> %tmp19, <i32 8, i32 8>
   %tmp21 = mul <2 x i32> %tmp18, %tmp20
-  store <2 x i32> %tmp21, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %tmp21, ptr addrspace(1) %out
   br label %bb7
 
 bb7:

diff  --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 524bd7f6995d3..ffc533decc042 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -6,7 +6,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
 
-define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; SI-LABEL: test_umul24_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -57,11 +57,11 @@ entry:
   %1 = shl i32 %b, 8
   %b_24 = lshr i32 %1, 8
   %2 = mul i32 %a_24, %b_24
-  store i32 %2, i32 addrspace(1)* %out
+  store i32 %2, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i16 %b) {
 ; SI-LABEL: test_umul24_i16_sext:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -106,11 +106,11 @@ define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a,
 entry:
   %mul = mul i16 %a, %b
   %ext = sext i16 %mul to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: test_umul24_i16_vgpr_sext:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -173,17 +173,17 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16
 ; GFX9-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
-  %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
-  %ptr_b = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.y
-  %a = load i16, i16 addrspace(1)* %ptr_a
-  %b = load i16, i16 addrspace(1)* %ptr_b
+  %ptr_a = getelementptr i16, ptr addrspace(1) %in, i32 %tid.x
+  %ptr_b = getelementptr i16, ptr addrspace(1) %in, i32 %tid.y
+  %a = load i16, ptr addrspace(1) %ptr_a
+  %b = load i16, ptr addrspace(1) %ptr_b
   %mul = mul i16 %a, %b
   %val = sext i16 %mul to i32
-  store i32 %val, i32 addrspace(1)* %out
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b) {
 ; SI-LABEL: test_umul24_i16:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -228,11 +228,11 @@ define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %
 entry:
   %mul = mul i16 %a, %b
   %ext = zext i16 %mul to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: test_umul24_i16_vgpr:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -293,17 +293,17 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addr
 ; GFX9-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
-  %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
-  %ptr_b = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.y
-  %a = load i16, i16 addrspace(1)* %ptr_a
-  %b = load i16, i16 addrspace(1)* %ptr_b
+  %ptr_a = getelementptr i16, ptr addrspace(1) %in, i32 %tid.x
+  %ptr_b = getelementptr i16, ptr addrspace(1) %in, i32 %tid.y
+  %a = load i16, ptr addrspace(1) %ptr_a
+  %b = load i16, ptr addrspace(1) %ptr_b
   %mul = mul i16 %a, %b
   %val = zext i16 %mul to i32
-  store i32 %val, i32 addrspace(1)* %out
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
+define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
 ; SI-LABEL: test_umul24_i8_vgpr:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    v_mov_b32_e32 v3, v0
@@ -366,17 +366,17 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrsp
 entry:
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
-  %a.ptr = getelementptr i8, i8 addrspace(1)* %a, i32 %tid.x
-  %b.ptr = getelementptr i8, i8 addrspace(1)* %b, i32 %tid.y
-  %a.l = load i8, i8 addrspace(1)* %a.ptr
-  %b.l = load i8, i8 addrspace(1)* %b.ptr
+  %a.ptr = getelementptr i8, ptr addrspace(1) %a, i32 %tid.x
+  %b.ptr = getelementptr i8, ptr addrspace(1) %b, i32 %tid.y
+  %a.l = load i8, ptr addrspace(1) %a.ptr
+  %b.l = load i8, ptr addrspace(1) %b.ptr
   %mul = mul i8 %a.l, %b.l
   %ext = sext i8 %mul to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; SI-LABEL: test_umulhi24_i32_i64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -425,11 +425,11 @@ entry:
   %mul48 = mul i64 %a.24.i64, %b.24.i64
   %mul48.hi = lshr i64 %mul48, 32
   %mul24hi = trunc i64 %mul48.hi to i32
-  store i32 %mul24hi, i32 addrspace(1)* %out
+  store i32 %mul24hi, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umulhi24(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; SI-LABEL: test_umulhi24:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -482,12 +482,12 @@ entry:
   %mul48 = mul i64 %a.24, %b.24
   %mul48.hi = lshr i64 %mul48, 32
   %mul24.hi = trunc i64 %mul48.hi to i32
-  store i32 %mul24.hi, i32 addrspace(1)* %out
+  store i32 %mul24.hi, ptr addrspace(1) %out
   ret void
 }
 
 ; Multiply with 24-bit inputs and 64-bit output.
-define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umul24_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; SI-LABEL: test_umul24_i64:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -547,7 +547,7 @@ entry:
   %tmp1 = shl i64 %b, 40
   %b_24 = lshr i64 %tmp1, 40
   %tmp2 = mul i64 %a_24, %b_24
-  store i64 %tmp2, i64 addrspace(1)* %out
+  store i64 %tmp2, ptr addrspace(1) %out
   ret void
 }
 
@@ -582,7 +582,7 @@ define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
   ret <2 x i64> %mul
 }
 
-define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
+define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i32], i64 %a) {
 ; SI-LABEL: test_umul24_i64_square:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0x13
@@ -627,11 +627,11 @@ entry:
   %tmp0 = shl i64 %a, 40
   %a.24 = lshr i64 %tmp0, 40
   %tmp2 = mul i64 %a.24, %a.24
-  store i64 %tmp2, i64 addrspace(1)* %out
+  store i64 %tmp2, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; SI-LABEL: test_umulhi16_i32:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -681,11 +681,11 @@ entry:
   %mul = mul i32 %a.16, %b.16
   %hi = lshr i32 %mul, 16
   %mulhi = trunc i32 %hi to i16
-  store i16 %mulhi, i16 addrspace(1)* %out
+  store i16 %mulhi, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
+define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
 ; SI-LABEL: test_umul24_i33:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -743,11 +743,11 @@ entry:
   %b_24 = lshr i33 %tmp1, 9
   %tmp2 = mul i33 %a_24, %b_24
   %ext = zext i33 %tmp2 to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
+define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
 ; SI-LABEL: test_umulhi24_i33:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
@@ -799,7 +799,7 @@ entry:
   %tmp2 = mul i33 %a_24, %b_24
   %hi = lshr i33 %tmp2, 32
   %trunc = trunc i33 %hi to i32
-  store i32 %trunc, i32 addrspace(1)* %out
+  store i32 %trunc, ptr addrspace(1) %out
   ret void
 }
 

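The mul_uint24 tests above show the whole conversion pattern in miniature: the value type stays spelled out on each load, store and getelementptr, while the pointer operand collapses to ptr addrspace(N). A minimal sketch of that shape (hypothetical function name, not part of the patch):

define amdgpu_kernel void @opaque_ptr_shape(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
entry:
  ; was: getelementptr i32, i32 addrspace(1)* %in, i32 %idx
  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %idx
  ; was: load i32, i32 addrspace(1)* %gep
  %val = load i32, ptr addrspace(1) %gep
  %mul = mul i32 %val, %val                ; the arithmetic is untouched by the conversion
  ; was: store i32 %mul, i32 addrspace(1)* %out
  store i32 %mul, ptr addrspace(1) %out
  ret void
}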
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 5a159955665d1..f58bed3b98e97 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -38,7 +38,7 @@
 ; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 
 ; IR: exit0:
-; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: store volatile i32 9, ptr addrspace(1) undef
 ; IR: br label %UnifiedReturnBlock
 
 ; IR: Flow1:
@@ -51,7 +51,7 @@
 ; IR: br i1 %15, label %exit1, label %Flow2
 
 ; IR: exit1:
-; IR: store volatile i32 17, i32 addrspace(3)* undef
+; IR: store volatile i32 17, ptr addrspace(3) undef
 ; IR:  br label %Flow2
 
 ; IR: UnifiedReturnBlock:
@@ -103,7 +103,7 @@
 
 ; GCN: ; %UnifiedReturnBlock
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
 entry:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %tmp1 = add i32 0, %tmp
@@ -111,14 +111,14 @@ entry:
   %tmp3 = add i64 0, %tmp2
   %tmp4 = shl i64 %tmp3, 32
   %tmp5 = ashr exact i64 %tmp4, 32
-  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
-  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
+  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
   %tmp8 = sext i32 %tmp7 to i64
-  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
-  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
+  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
   %tmp13 = zext i32 %tmp10 to i64
-  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
-  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
+  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
   %Pivot = icmp slt i32 %tmp16, 2
   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 
@@ -131,11 +131,11 @@ LeafBlock1:                                       ; preds = %entry
   br i1 %SwitchLeaf2, label %exit0, label %exit1
 
 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 9, i32 addrspace(1)* undef
+  store volatile i32 9, ptr addrspace(1) undef
   ret void
 
 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 17, i32 addrspace(3)* undef
+  store volatile i32 17, ptr addrspace(3) undef
   ret void
 }
 
@@ -158,7 +158,7 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
 ; GCN: ; %UnifiedUnreachableBlock
 ; GCN-NEXT: .Lfunc_end
-define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
 entry:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %tmp1 = add i32 0, %tmp
@@ -166,14 +166,14 @@ entry:
   %tmp3 = add i64 0, %tmp2
   %tmp4 = shl i64 %tmp3, 32
   %tmp5 = ashr exact i64 %tmp4, 32
-  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
-  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
+  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
   %tmp8 = sext i32 %tmp7 to i64
-  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
-  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
+  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
   %tmp13 = zext i32 %tmp10 to i64
-  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
-  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
+  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
   %Pivot = icmp slt i32 %tmp16, 2
   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 
@@ -186,11 +186,11 @@ LeafBlock1:                                       ; preds = %entry
   br i1 %SwitchLeaf2, label %exit0, label %exit1
 
 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 9, i32 addrspace(1)* undef
+  store volatile i32 9, ptr addrspace(1) undef
   unreachable
 
 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 17, i32 addrspace(3)* undef
+  store volatile i32 17, ptr addrspace(3) undef
   unreachable
 }
 
@@ -220,7 +220,7 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 
 ; IR: exit0:
-; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: store volatile i32 9, ptr addrspace(1) undef
 ; IR: br label %UnifiedReturnBlock
 
 ; IR: {{^}}Flow1:
@@ -233,13 +233,13 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR: br i1 %15, label %exit1, label %Flow2
 
 ; IR: exit1:
-; IR: store volatile i32 17, i32 addrspace(3)* undef
+; IR: store volatile i32 17, ptr addrspace(3) undef
 ; IR: br label %Flow2
 
 ; IR: UnifiedReturnBlock:
 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %11)
 ; IR: ret void
-define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
+define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2, i32 %arg3) #0 {
 entry:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %tmp1 = add i32 0, %tmp
@@ -247,14 +247,14 @@ entry:
   %tmp3 = add i64 0, %tmp2
   %tmp4 = shl i64 %tmp3, 32
   %tmp5 = ashr exact i64 %tmp4, 32
-  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
-  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
+  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
   %tmp8 = sext i32 %tmp7 to i64
-  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
-  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
+  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
   %tmp13 = zext i32 %tmp10 to i64
-  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
-  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
+  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
   %divergent.cond0 = icmp slt i32 %tmp16, 2
   br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1
 
@@ -267,11 +267,11 @@ LeafBlock1:                                       ; preds = %entry
   br i1 %uniform.cond0, label %exit0, label %exit1
 
 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 9, i32 addrspace(1)* undef
+  store volatile i32 9, ptr addrspace(1) undef
   ret void
 
 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 17, i32 addrspace(3)* undef
+  store volatile i32 17, ptr addrspace(3) undef
   ret void
 }
 
@@ -288,7 +288,7 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
 ; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
 
-define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
+define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2, i32 %arg3) #0 {
 entry:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %tmp1 = add i32 0, %tmp
@@ -296,14 +296,14 @@ entry:
   %tmp3 = add i64 0, %tmp2
   %tmp4 = shl i64 %tmp3, 32
   %tmp5 = ashr exact i64 %tmp4, 32
-  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
-  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
+  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
   %tmp8 = sext i32 %tmp7 to i64
-  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
-  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
+  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
   %tmp13 = zext i32 %tmp10 to i64
-  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
-  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
+  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
   %Pivot = icmp slt i32 %tmp16, 2
   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 
@@ -316,11 +316,11 @@ LeafBlock1:                                       ; preds = %entry
   br i1 %SwitchLeaf2, label %exit0, label %exit1
 
 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 9, i32 addrspace(1)* undef
+  store volatile i32 9, ptr addrspace(1) undef
   ret void
 
 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 17, i32 addrspace(3)* undef
+  store volatile i32 17, ptr addrspace(3) undef
   ret void
 }
 
@@ -348,11 +348,11 @@ LeafBlock1:                                       ; preds = %entry
   br i1 %SwitchLeaf2, label %exit0, label %exit1
 
 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
-  store i32 9, i32 addrspace(1)* undef
+  store i32 9, ptr addrspace(1) undef
   ret float 1.0
 
 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
-  store i32 17, i32 addrspace(3)* undef
+  store i32 17, ptr addrspace(3) undef
   ret float 2.0
 }
 
@@ -390,11 +390,11 @@ LeafBlock1:                                       ; preds = %entry
   br i1 %divergent.cond1, label %exit0, label %exit1
 
 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
-  store i32 9, i32 addrspace(1)* undef
+  store i32 9, ptr addrspace(1) undef
   ret float 1.0
 
 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
-  store i32 17, i32 addrspace(3)* undef
+  store i32 17, ptr addrspace(3) undef
   ret float 2.0
 }
 
@@ -413,7 +413,7 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 
 ; IR: exit0:
-; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
+; IR-NEXT: store volatile i32 17, ptr addrspace(3) undef
 ; IR-NEXT: br label %UnifiedReturnBlock
 
 ; IR: Flow1:
@@ -426,14 +426,14 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR: br i1 %15, label %exit1, label %Flow2
 
 ; IR: exit1:
-; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
+; IR-NEXT: store volatile i32 9, ptr addrspace(1) undef
 ; IR-NEXT: call void @llvm.amdgcn.unreachable()
 ; IR-NEXT: br label %Flow2
 
 ; IR: UnifiedReturnBlock:
 ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %11)
 ; IR-NEXT: ret void
-define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
 entry:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %tmp1 = add i32 0, %tmp
@@ -441,14 +441,14 @@ entry:
   %tmp3 = add i64 0, %tmp2
   %tmp4 = shl i64 %tmp3, 32
   %tmp5 = ashr exact i64 %tmp4, 32
-  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
-  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
+  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
   %tmp8 = sext i32 %tmp7 to i64
-  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
-  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
+  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
   %tmp13 = zext i32 %tmp10 to i64
-  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
-  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
+  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
   %Pivot = icmp slt i32 %tmp16, 2
   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 
@@ -461,11 +461,11 @@ LeafBlock1:                                       ; preds = %entry
   br i1 %SwitchLeaf2, label %exit0, label %exit1
 
 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 17, i32 addrspace(3)* undef
+  store volatile i32 17, ptr addrspace(3) undef
   ret void
 
 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 9, i32 addrspace(1)* undef
+  store volatile i32 9, ptr addrspace(1) undef
   unreachable
 }
 
@@ -475,21 +475,21 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(
 
 ; IR: exit0:                                            ; preds = %Flow2
-; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
+; IR-NEXT: store volatile i32 17, ptr addrspace(3) undef
 ; IR-NEXT: br label %UnifiedReturnBlock
 
 
 ; IR: indirect.exit1:
-; IR: %load = load volatile i32, i32 addrspace(1)* undef
-; IR: store volatile i32 %load, i32 addrspace(1)* undef
-; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: %load = load volatile i32, ptr addrspace(1) undef
+; IR: store volatile i32 %load, ptr addrspace(1) undef
+; IR: store volatile i32 9, ptr addrspace(1) undef
 ; IR: call void @llvm.amdgcn.unreachable()
 ; IR-NEXT: br label %Flow2
 
 ; IR: UnifiedReturnBlock:                               ; preds = %exit0, %Flow2
 ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %11)
 ; IR-NEXT: ret void
-define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
 entry:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %tmp1 = add i32 0, %tmp
@@ -497,14 +497,14 @@ entry:
   %tmp3 = add i64 0, %tmp2
   %tmp4 = shl i64 %tmp3, 32
   %tmp5 = ashr exact i64 %tmp4, 32
-  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
-  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
+  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
   %tmp8 = sext i32 %tmp7 to i64
-  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
-  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
+  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
   %tmp13 = zext i32 %tmp10 to i64
-  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
-  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
+  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
   %Pivot = icmp slt i32 %tmp16, 2
   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 
@@ -517,21 +517,21 @@ LeafBlock1:                                       ; preds = %entry
   br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1
 
 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 17, i32 addrspace(3)* undef
+  store volatile i32 17, ptr addrspace(3) undef
   ret void
 
 indirect.exit1:
-  %load = load volatile i32, i32 addrspace(1)* undef
-  store volatile i32 %load, i32 addrspace(1)* undef
+  %load = load volatile i32, ptr addrspace(1) undef
+  store volatile i32 %load, ptr addrspace(1) undef
   br label %exit1
 
 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 9, i32 addrspace(1)* undef
+  store volatile i32 9, ptr addrspace(1) undef
   unreachable
 }
 
 ; IR-LABEL: @multi_divergent_region_exit_ret_switch(
-define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
 entry:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %tmp1 = add i32 0, %tmp
@@ -539,14 +539,14 @@ entry:
   %tmp3 = add i64 0, %tmp2
   %tmp4 = shl i64 %tmp3, 32
   %tmp5 = ashr exact i64 %tmp4, 32
-  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
-  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
+  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
   %tmp8 = sext i32 %tmp7 to i64
-  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
-  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
+  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
   %tmp13 = zext i32 %tmp10 to i64
-  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
-  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
+  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
   switch i32 %tmp16, label %exit1
     [ i32 1, label %LeafBlock
       i32 2, label %LeafBlock1
@@ -561,11 +561,11 @@ LeafBlock1:                                       ; preds = %entry
   br i1 %SwitchLeaf2, label %exit0, label %exit1
 
 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 17, i32 addrspace(3)* undef
+  store volatile i32 17, ptr addrspace(3) undef
   ret void
 
 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
-  store volatile i32 9, i32 addrspace(1)* undef
+  store volatile i32 9, ptr addrspace(1) undef
   unreachable
 }
 
@@ -581,15 +581,15 @@ divergent.multi.exit.region:
   br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1
 
 divergent.ret0:
-  store volatile i32 11, i32 addrspace(3)* undef
+  store volatile i32 11, ptr addrspace(3) undef
   ret void
 
 divergent.ret1:
-  store volatile i32 42, i32 addrspace(3)* undef
+  store volatile i32 42, ptr addrspace(3) undef
   ret void
 
 uniform.ret:
-  store volatile i32 9, i32 addrspace(1)* undef
+  store volatile i32 9, ptr addrspace(1) undef
   ret void
 }
 
@@ -605,30 +605,30 @@ divergent.multi.exit.region:
   br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1
 
 divergent.if:
-  %vgpr0 = load volatile float, float addrspace(1)* undef
+  %vgpr0 = load volatile float, ptr addrspace(1) undef
   %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
   br i1 %divergent.cond1, label %divergent.then, label %divergent.endif
 
 divergent.then:
-  %vgpr1 = load volatile float, float addrspace(1)* undef
+  %vgpr1 = load volatile float, ptr addrspace(1) undef
   %divergent.cond2 = fcmp olt float %vgpr1, 4.0
-  store volatile i32 33, i32 addrspace(1)* undef
+  store volatile i32 33, ptr addrspace(1) undef
   br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif
 
 divergent.endif:
-  store volatile i32 38, i32 addrspace(1)* undef
+  store volatile i32 38, ptr addrspace(1) undef
   br label %divergent.ret0
 
 divergent.ret0:
-  store volatile i32 11, i32 addrspace(3)* undef
+  store volatile i32 11, ptr addrspace(3) undef
   ret void
 
 divergent.ret1:
-  store volatile i32 42, i32 addrspace(3)* undef
+  store volatile i32 42, ptr addrspace(3) undef
   ret void
 
 uniform.ret:
-  store volatile i32 9, i32 addrspace(1)* undef
+  store volatile i32 9, ptr addrspace(1) undef
   ret void
 }
 
@@ -655,30 +655,30 @@ uniform.multi.exit.region:
   br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1
 
 uniform.if:
-  %sgpr0 = load volatile i32, i32 addrspace(4)* undef
+  %sgpr0 = load volatile i32, ptr addrspace(4) undef
   %uniform.cond1 = icmp slt i32 %sgpr0, 1
   br i1 %uniform.cond1, label %uniform.then, label %uniform.endif
 
 uniform.then:
-  %sgpr1 = load volatile i32, i32 addrspace(4)* undef
+  %sgpr1 = load volatile i32, ptr addrspace(4) undef
   %uniform.cond2 = icmp sge i32 %sgpr1, 4
-  store volatile i32 33, i32 addrspace(1)* undef
+  store volatile i32 33, ptr addrspace(1) undef
   br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif
 
 uniform.endif:
-  store volatile i32 38, i32 addrspace(1)* undef
+  store volatile i32 38, ptr addrspace(1) undef
   br label %uniform.ret0
 
 uniform.ret0:
-  store volatile i32 11, i32 addrspace(3)* undef
+  store volatile i32 11, ptr addrspace(3) undef
   ret void
 
 uniform.ret1:
-  store volatile i32 42, i32 addrspace(3)* undef
+  store volatile i32 42, ptr addrspace(3) undef
   ret void
 
 divergent.ret:
-  store volatile i32 9, i32 addrspace(1)* undef
+  store volatile i32 9, ptr addrspace(1) undef
   ret void
 }
 

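In multi-divergent-exit-region.ll the exit blocks are told apart only by which address space their volatile store targets (global vs. LDS), so the same textual rewrite appears twice: once in the IR body and once in the hand-written "; IR:" FileCheck lines. A minimal sketch of the store forms after the change (illustrative only, not part of the patch):

define void @exit_store_shape() {
  store volatile i32 9, ptr addrspace(1) undef    ; global store, was i32 addrspace(1)* undef
  store volatile i32 17, ptr addrspace(3) undef   ; LDS store, was i32 addrspace(3)* undef
  ret void
}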
diff --git a/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll
index 73d36325a88e9..f9ff3dd281290 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll
@@ -17,14 +17,14 @@ define void @spill_v2i32() {
 entry:
   %alloca = alloca <2 x i32>, i32 2, align 4, addrspace(5)
 
-  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1
-  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
+  %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
+  %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1
-  store volatile <2 x i32> %a, <2 x i32> addrspace(5)* %outptr
+  %outptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
+  store volatile <2 x i32> %a, ptr addrspace(5) %outptr
 
   ret void
 }
@@ -45,14 +45,14 @@ define void @spill_v2f32() {
 entry:
   %alloca = alloca <2 x i32>, i32 2, align 4, addrspace(5)
 
-  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1
-  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
+  %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
+  %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1
-  store volatile <2 x i32> %a, <2 x i32> addrspace(5)* %outptr
+  %outptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
+  store volatile <2 x i32> %a, ptr addrspace(5) %outptr
 
   ret void
 }
@@ -75,14 +75,14 @@ define void @spill_v3i32() {
 entry:
   %alloca = alloca <3 x i32>, i32 2, align 4, addrspace(5)
 
-  %aptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1
-  %a = load volatile <3 x i32>, <3 x i32> addrspace(5)* %aptr
+  %aptr = getelementptr <3 x i32>, ptr addrspace(5) %alloca, i32 1
+  %a = load volatile <3 x i32>, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1
-  store volatile <3 x i32> %a, <3 x i32> addrspace(5)* %outptr
+  %outptr = getelementptr <3 x i32>, ptr addrspace(5) %alloca, i32 1
+  store volatile <3 x i32> %a, ptr addrspace(5) %outptr
 
   ret void
 }
@@ -105,14 +105,14 @@ define void @spill_v3f32() {
 entry:
   %alloca = alloca <3 x i32>, i32 2, align 4, addrspace(5)
 
-  %aptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1
-  %a = load volatile <3 x i32>, <3 x i32> addrspace(5)* %aptr
+  %aptr = getelementptr <3 x i32>, ptr addrspace(5) %alloca, i32 1
+  %a = load volatile <3 x i32>, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1
-  store volatile <3 x i32> %a, <3 x i32> addrspace(5)* %outptr
+  %outptr = getelementptr <3 x i32>, ptr addrspace(5) %alloca, i32 1
+  store volatile <3 x i32> %a, ptr addrspace(5) %outptr
 
   ret void
 }
@@ -137,14 +137,14 @@ define void @spill_v4i32() {
 entry:
   %alloca = alloca <4 x i32>, i32 2, align 4, addrspace(5)
 
-  %aptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1
-  %a = load volatile <4 x i32>, <4 x i32> addrspace(5)* %aptr
+  %aptr = getelementptr <4 x i32>, ptr addrspace(5) %alloca, i32 1
+  %a = load volatile <4 x i32>, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1
-  store volatile <4 x i32> %a, <4 x i32> addrspace(5)* %outptr
+  %outptr = getelementptr <4 x i32>, ptr addrspace(5) %alloca, i32 1
+  store volatile <4 x i32> %a, ptr addrspace(5) %outptr
 
   ret void
 }
@@ -169,14 +169,14 @@ define void @spill_v4f32() {
 entry:
   %alloca = alloca <4 x i32>, i32 2, align 4, addrspace(5)
 
-  %aptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1
-  %a = load volatile <4 x i32>, <4 x i32> addrspace(5)* %aptr
+  %aptr = getelementptr <4 x i32>, ptr addrspace(5) %alloca, i32 1
+  %a = load volatile <4 x i32>, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1
-  store volatile <4 x i32> %a, <4 x i32> addrspace(5)* %outptr
+  %outptr = getelementptr <4 x i32>, ptr addrspace(5) %alloca, i32 1
+  store volatile <4 x i32> %a, ptr addrspace(5) %outptr
 
   ret void
 }
@@ -202,14 +202,14 @@ define void @spill_v5i32() {
 entry:
   %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5)
 
-  %aptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1
-  %a = load volatile <5 x i32>, <5 x i32> addrspace(5)* %aptr
+  %aptr = getelementptr <5 x i32>, ptr addrspace(5) %alloca, i32 1
+  %a = load volatile <5 x i32>, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1
-  store volatile <5 x i32> %a, <5 x i32> addrspace(5)* %outptr
+  %outptr = getelementptr <5 x i32>, ptr addrspace(5) %alloca, i32 1
+  store volatile <5 x i32> %a, ptr addrspace(5) %outptr
 
   ret void
 }
@@ -235,14 +235,14 @@ define void @spill_v5f32() {
 entry:
   %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5)
 
-  %aptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1
-  %a = load volatile <5 x i32>, <5 x i32> addrspace(5)* %aptr
+  %aptr = getelementptr <5 x i32>, ptr addrspace(5) %alloca, i32 1
+  %a = load volatile <5 x i32>, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1
-  store volatile <5 x i32> %a, <5 x i32> addrspace(5)* %outptr
+  %outptr = getelementptr <5 x i32>, ptr addrspace(5) %alloca, i32 1
+  store volatile <5 x i32> %a, ptr addrspace(5) %outptr
 
   ret void
 }

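The spill tests keep their alloca element types; only the private-pointer (addrspace(5)) operands on the getelementptr, load and store are rewritten. A minimal sketch of one such test's skeleton (hypothetical name, assuming the same inline-asm clobber trick used above):

define void @spill_shape_sketch() {
entry:
  %alloca = alloca <2 x i32>, i32 2, align 4, addrspace(5)
  %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
  %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
  ; clobber enough VGPRs that %a must be spilled across the asm
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3}"()
  store volatile <2 x i32> %a, ptr addrspace(5) %aptr
  ret void
}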
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index 463aacd8e28eb..f7479c8475592 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -119,7 +119,7 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    [[LSR_IV:%.*]] = phi i32 [ undef, [[BB]] ], [ [[TMP2:%.*]], [[FLOW4]] ]
 ; OPT-NEXT:    [[LSR_IV_NEXT:%.*]] = add i32 [[LSR_IV]], 1
 ; OPT-NEXT:    [[CMP0:%.*]] = icmp slt i32 [[LSR_IV_NEXT]], 0
-; OPT-NEXT:    [[LOAD0:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT:    [[LOAD0:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; OPT-NEXT:    br label [[NODEBLOCK:%.*]]
 ; OPT:       NodeBlock:
 ; OPT-NEXT:    [[PIVOT:%.*]] = icmp sge i32 [[LOAD0]], 1
@@ -141,7 +141,7 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    [[TMP5:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP4]])
 ; OPT-NEXT:    br i1 [[TMP5]], label [[BB9:%.*]], label [[BB1]]
 ; OPT:       case0:
-; OPT-NEXT:    [[LOAD1:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT:    [[LOAD1:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; OPT-NEXT:    [[CMP1:%.*]] = icmp sge i32 [[TMP]], [[LOAD1]]
 ; OPT-NEXT:    br label [[FLOW5]]
 ; OPT:       Flow:
@@ -150,7 +150,7 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
 ; OPT-NEXT:    [[TMP8:%.*]] = phi i1 [ false, [[FLOW3]] ], [ true, [[NODEBLOCK]] ]
 ; OPT-NEXT:    br i1 [[TMP8]], label [[LEAFBLOCK:%.*]], label [[FLOW4]]
 ; OPT:       case1:
-; OPT-NEXT:    [[LOAD2:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT:    [[LOAD2:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; OPT-NEXT:    [[CMP2]] = icmp sge i32 [[TMP]], [[LOAD2]]
 ; OPT-NEXT:    br label [[FLOW3]]
 ; OPT:       Flow5:
@@ -226,19 +226,19 @@ bb1:
   %lsr.iv = phi i32 [ undef, %bb ], [ %lsr.iv.next, %case0 ], [ %lsr.iv.next, %case1 ]
   %lsr.iv.next = add i32 %lsr.iv, 1
   %cmp0 = icmp slt i32 %lsr.iv.next, 0
-  %load0 = load volatile i32, i32 addrspace(1)* undef, align 4
+  %load0 = load volatile i32, ptr addrspace(1) undef, align 4
   switch i32 %load0, label %bb9 [
   i32 0, label %case0
   i32 1, label %case1
   ]
 
 case0:
-  %load1 = load volatile i32, i32 addrspace(1)* undef, align 4
+  %load1 = load volatile i32, ptr addrspace(1) undef, align 4
   %cmp1 = icmp slt i32 %tmp, %load1
   br i1 %cmp1, label %bb1, label %bb9
 
 case1:
-  %load2 = load volatile i32, i32 addrspace(1)* undef, align 4
+  %load2 = load volatile i32, ptr addrspace(1) undef, align 4
   %cmp2 = icmp slt i32 %tmp, %load2
   br i1 %cmp2, label %bb1, label %bb9
 

diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index a4fb87e18a15d..f11959c53dc19 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -16,7 +16,7 @@ define hidden fastcc void @callee_has_fp() #1 {
 ; CHECK-NEXT:    s_mov_b32 s33, s4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
-  store volatile i32 1, i32 addrspace(5)* %alloca
+  store volatile i32 1, ptr addrspace(5) %alloca
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 31ce0da580f89..17bbcde2cc62a 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -7,7 +7,7 @@
 ; the condition that appears to have no uses until the loop is
 ; completely processed.
 
-define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* nocapture %arg) #0 {
+define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocapture %arg) #0 {
 ; GCN-LABEL: reduced_nested_loop_conditions:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
@@ -49,8 +49,8 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
 ; IR-LABEL: @reduced_nested_loop_conditions(
 ; IR-NEXT:  bb:
 ; IR-NEXT:    [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4:[0-9]+]]
-; IR-NEXT:    [[MY_TMP1:%.*]] = getelementptr inbounds i64, i64 addrspace(3)* [[ARG:%.*]], i32 [[MY_TMP]]
-; IR-NEXT:    [[MY_TMP2:%.*]] = load volatile i64, i64 addrspace(3)* [[MY_TMP1]], align 4
+; IR-NEXT:    [[MY_TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(3) [[ARG:%.*]], i32 [[MY_TMP]]
+; IR-NEXT:    [[MY_TMP2:%.*]] = load volatile i64, ptr addrspace(3) [[MY_TMP1]], align 4
 ; IR-NEXT:    br label [[BB5:%.*]]
 ; IR:       bb3:
 ; IR-NEXT:    br i1 true, label [[BB4:%.*]], label [[BB13:%.*]]
@@ -83,8 +83,8 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
 ; IR-NEXT:    br i1 [[MY_TMP14]], label [[BB16:%.*]], label [[BB20:%.*]]
 ; IR:       bb16:
 ; IR-NEXT:    [[MY_TMP17:%.*]] = extractelement <2 x i32> [[MY_TMP15]], i64 1
-; IR-NEXT:    [[MY_TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 [[MY_TMP17]]
-; IR-NEXT:    [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[MY_TMP18]], align 4
+; IR-NEXT:    [[MY_TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(3) undef, i32 [[MY_TMP17]]
+; IR-NEXT:    [[MY_TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[MY_TMP18]], align 4
 ; IR-NEXT:    br label [[BB20]]
 ; IR:       bb20:
 ; IR-NEXT:    [[MY_TMP21]] = phi i32 [ [[MY_TMP19]], [[BB16]] ], [ 0, [[BB13]] ]
@@ -95,8 +95,8 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
 ; IR-NEXT:    ret void
 bb:
   %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
-  %my.tmp1 = getelementptr inbounds i64, i64 addrspace(3)* %arg, i32 %my.tmp
-  %my.tmp2 = load volatile i64, i64 addrspace(3)* %my.tmp1
+  %my.tmp1 = getelementptr inbounds i64, ptr addrspace(3) %arg, i32 %my.tmp
+  %my.tmp2 = load volatile i64, ptr addrspace(3) %my.tmp1
   br label %bb5
 
 bb3:                                              ; preds = %bb9
@@ -128,8 +128,8 @@ bb13:                                             ; preds = %bb8, %bb3
 
 bb16:                                             ; preds = %bb13
   %my.tmp17 = extractelement <2 x i32> %my.tmp15, i64 1
-  %my.tmp18 = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 %my.tmp17
-  %my.tmp19 = load volatile i32, i32 addrspace(3)* %my.tmp18
+  %my.tmp18 = getelementptr inbounds i32, ptr addrspace(3) undef, i32 %my.tmp17
+  %my.tmp19 = load volatile i32, ptr addrspace(3) %my.tmp18
   br label %bb20
 
 bb20:                                             ; preds = %bb16, %bb13
@@ -143,7 +143,7 @@ bb23:                                             ; preds = %bb10
 
 ; Earlier version of above, before a run of the structurizer.
 
-define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %arg) #0 {
+define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %arg) #0 {
 ; GCN-LABEL: nested_loop_conditions:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -190,15 +190,15 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
 ; GCN-NEXT:    s_endpgm
 ; IR-LABEL: @nested_loop_conditions(
 ; IR-NEXT:  bb:
-; IR-NEXT:    [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; IR-NEXT:    [[MY_TMP1134:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; IR-NEXT:    [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
 ; IR-NEXT:    br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
 ; IR:       bb14.lr.ph:
 ; IR-NEXT:    [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4]]
 ; IR-NEXT:    [[MY_TMP1:%.*]] = zext i32 [[MY_TMP]] to i64
-; IR-NEXT:    [[MY_TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[ARG:%.*]], i64 [[MY_TMP1]]
-; IR-NEXT:    [[MY_TMP3:%.*]] = load i64, i64 addrspace(1)* [[MY_TMP2]], align 16
-; IR-NEXT:    [[MY_TMP932:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
+; IR-NEXT:    [[MY_TMP2:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[ARG:%.*]], i64 [[MY_TMP1]]
+; IR-NEXT:    [[MY_TMP3:%.*]] = load i64, ptr addrspace(1) [[MY_TMP2]], align 16
+; IR-NEXT:    [[MY_TMP932:%.*]] = load <4 x i32>, ptr addrspace(1) undef, align 16
 ; IR-NEXT:    [[MY_TMP1033:%.*]] = extractelement <4 x i32> [[MY_TMP932]], i64 0
 ; IR-NEXT:    br label [[BB14:%.*]]
 ; IR:       Flow3:
@@ -244,7 +244,7 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
 ; IR-NEXT:    [[TMP17:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP16]])
 ; IR-NEXT:    br i1 [[TMP17]], label [[FLOW2:%.*]], label [[BB14]]
 ; IR:       bb18:
-; IR-NEXT:    [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; IR-NEXT:    [[MY_TMP19:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; IR-NEXT:    [[MY_TMP20:%.*]] = icmp slt i32 [[MY_TMP19]], 9
 ; IR-NEXT:    br i1 [[MY_TMP20]], label [[BB21]], label [[BB18]]
 ; IR:       bb21:
@@ -258,10 +258,10 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
 ; IR-NEXT:    [[MY_TMP29:%.*]] = select i1 [[MY_TMP28]], i64 1, i64 2
 ; IR-NEXT:    [[MY_TMP30:%.*]] = extractelement <4 x i32> [[MY_TMP936]], i64 [[MY_TMP29]]
 ; IR-NEXT:    [[MY_TMP7:%.*]] = zext i32 [[MY_TMP30]] to i64
-; IR-NEXT:    [[MY_TMP8:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 [[MY_TMP7]]
-; IR-NEXT:    [[MY_TMP9]] = load <4 x i32>, <4 x i32> addrspace(1)* [[MY_TMP8]], align 16
+; IR-NEXT:    [[MY_TMP8:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) undef, i64 [[MY_TMP7]]
+; IR-NEXT:    [[MY_TMP9]] = load <4 x i32>, ptr addrspace(1) [[MY_TMP8]], align 16
 ; IR-NEXT:    [[MY_TMP10]] = extractelement <4 x i32> [[MY_TMP9]], i64 0
-; IR-NEXT:    [[MY_TMP11:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
+; IR-NEXT:    [[MY_TMP11:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4
 ; IR-NEXT:    [[MY_TMP12]] = icmp sge i32 [[MY_TMP11]], 9
 ; IR-NEXT:    br label [[FLOW1]]
 ; IR:       Flow2:
@@ -274,19 +274,19 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
 ; IR-NEXT:    br label [[FLOW3]]
 ; IR:       bb31:
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
-; IR-NEXT:    store volatile i32 0, i32 addrspace(1)* undef, align 4
+; IR-NEXT:    store volatile i32 0, ptr addrspace(1) undef, align 4
 ; IR-NEXT:    ret void
 bb:
-  %my.tmp1134 = load volatile i32, i32 addrspace(1)* undef
+  %my.tmp1134 = load volatile i32, ptr addrspace(1) undef
   %my.tmp1235 = icmp slt i32 %my.tmp1134, 9
   br i1 %my.tmp1235, label %bb14.lr.ph, label %bb13
 
 bb14.lr.ph:                                       ; preds = %bb
   %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %my.tmp1 = zext i32 %my.tmp to i64
-  %my.tmp2 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %my.tmp1
-  %my.tmp3 = load i64, i64 addrspace(1)* %my.tmp2, align 16
-  %my.tmp932 = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
+  %my.tmp2 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %my.tmp1
+  %my.tmp3 = load i64, ptr addrspace(1) %my.tmp2, align 16
+  %my.tmp932 = load <4 x i32>, ptr addrspace(1) undef, align 16
   %my.tmp1033 = extractelement <4 x i32> %my.tmp932, i64 0
   br label %bb14
 
@@ -307,7 +307,7 @@ bb16:                                             ; preds = %bb14
   br label %bb18
 
 bb18:                                             ; preds = %bb18, %bb16
-  %my.tmp19 = load volatile i32, i32 addrspace(1)* undef
+  %my.tmp19 = load volatile i32, ptr addrspace(1) undef
   %my.tmp20 = icmp slt i32 %my.tmp19, 9
   br i1 %my.tmp20, label %bb21, label %bb18
 
@@ -322,10 +322,10 @@ bb21:                                             ; preds = %bb18
   %my.tmp29 = select i1 %my.tmp28, i64 1, i64 2
   %my.tmp30 = extractelement <4 x i32> %my.tmp936, i64 %my.tmp29
   %my.tmp7 = zext i32 %my.tmp30 to i64
-  %my.tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %my.tmp7
-  %my.tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %my.tmp8, align 16
+  %my.tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(1) undef, i64 %my.tmp7
+  %my.tmp9 = load <4 x i32>, ptr addrspace(1) %my.tmp8, align 16
   %my.tmp10 = extractelement <4 x i32> %my.tmp9, i64 0
-  %my.tmp11 = load volatile i32, i32 addrspace(1)* undef
+  %my.tmp11 = load volatile i32, ptr addrspace(1) undef
   %my.tmp12 = icmp slt i32 %my.tmp11, 9
   br i1 %my.tmp12, label %bb14, label %bb4.bb13_crit_edge
 
@@ -333,7 +333,7 @@ bb31.loopexit:                                    ; preds = %bb14
   br label %bb31
 
 bb31:                                             ; preds = %bb31.loopexit, %bb13
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   ret void
 }
 

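nested-loop-conditions.ll is the same story with auto-generated "; IR-NEXT:" check lines: the pointee type disappears from the pointer, but the loaded value type (including vector types like <4 x i32>) still names it, so the FileCheck patterns remain unambiguous. A minimal sketch of that shape (hypothetical name, not part of the patch):

define <4 x i32> @vector_load_shape(ptr addrspace(1) %p, i64 %i) {
  ; was: getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %p, i64 %i
  %gep = getelementptr inbounds <4 x i32>, ptr addrspace(1) %p, i64 %i
  %v = load <4 x i32>, ptr addrspace(1) %gep, align 16
  ret <4 x i32> %v
}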
diff --git a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
index eb16f8517ff9c..2d0ca8be978f1 100644
--- a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -9,9 +9,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
 ; SI: s_load_dword s
 ; SI: buffer_store_short v
-define amdgpu_kernel void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i32_to_i16(ptr addrspace(1) %out, i32 %arg) nounwind {
   %trunc = trunc i32 %arg to i16
-  store i16 %trunc, i16 addrspace(1)* %out
+  store i16 %trunc, ptr addrspace(1) %out
   ret void
 }
 
@@ -21,103 +21,103 @@ define amdgpu_kernel void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_short v
-define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
-  %load = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i16, ptr addrspace(1) %out, i32 %tid
+  %load = load i32, ptr addrspace(1) %gep.in
   %trunc = trunc i32 %load to i16
-  store i16 %trunc, i16 addrspace(1)* %gep.out
+  store i16 %trunc, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define amdgpu_kernel void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i32_to_i8(ptr addrspace(1) %out, i32 %arg) nounwind {
   %trunc = trunc i32 %arg to i8
-  store i8 %trunc, i8 addrspace(1)* %out
+  store i8 %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_byte v
-define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
-  %load = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i8, ptr addrspace(1) %out, i32 %tid
+  %load = load i32, ptr addrspace(1) %gep.in
   %trunc = trunc i32 %load to i8
-  store i8 %trunc, i8 addrspace(1)* %gep.out
+  store i8 %trunc, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define amdgpu_kernel void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i32_to_i1(ptr addrspace(1) %out, i32 %arg) nounwind {
   %trunc = trunc i32 %arg to i1
-  store i1 %trunc, i1 addrspace(1)* %out
+  store i1 %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_byte v
-define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid
-  %load = load i32, i32 addrspace(1)* %gep.in
+  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i1, ptr addrspace(1) %out, i32 %tid
+  %load = load i32, ptr addrspace(1) %gep.in
   %trunc = trunc i32 %load to i1
-  store i1 %trunc, i1 addrspace(1)* %gep.out
+  store i1 %trunc, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
 ; SI: s_load_dword s
 ; SI: buffer_store_dword v
-define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i64_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %arg) nounwind {
   %trunc = trunc i64 %arg to i32
-  store i32 %trunc, i32 addrspace(1)* %out
+  store i32 %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_dword v
-define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %load = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %load = load i64, ptr addrspace(1) %gep.in
   %trunc = trunc i64 %load to i32
-  store i32 %trunc, i32 addrspace(1)* %gep.out
+  store i32 %trunc, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
 ; SI: s_load_dword s
 ; SI: buffer_store_dword v
-define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
+define amdgpu_kernel void @srl_kernarg_i64_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %arg) nounwind {
   %srl = lshr i64 %arg, 32
   %trunc = trunc i64 %srl to i32
-  store i32 %trunc, i32 addrspace(1)* %out
+  store i32 %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_dword v
-define amdgpu_kernel void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @srl_buffer_load_i64_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %load = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %load = load i64, ptr addrspace(1) %gep.in
   %srl = lshr i64 %load, 32
   %trunc = trunc i64 %srl to i32
-  store i32 %trunc, i32 addrspace(1)* %gep.out
+  store i32 %trunc, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -125,68 +125,68 @@ define amdgpu_kernel void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i6
 ; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define amdgpu_kernel void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i16_to_i8(ptr addrspace(1) %out, i16 %arg) nounwind {
   %trunc = trunc i16 %arg to i8
-  store i8 %trunc, i8 addrspace(1)* %out
+  store i8 %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
 ; SI: buffer_load_ubyte v
 ; SI: buffer_store_byte v
-define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
-  %load = load i16, i16 addrspace(1)* %gep.in
+  %gep.in = getelementptr i16, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i8, ptr addrspace(1) %out, i32 %tid
+  %load = load i16, ptr addrspace(1) %gep.in
   %trunc = trunc i16 %load to i8
-  store i8 %trunc, i8 addrspace(1)* %gep.out
+  store i8 %trunc, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
+define amdgpu_kernel void @srl_kernarg_i64_to_i8(ptr addrspace(1) %out, [8 x i32], i64 %arg) nounwind {
   %srl = lshr i64 %arg, 32
   %trunc = trunc i64 %srl to i8
-  store i8 %trunc, i8 addrspace(1)* %out
+  store i8 %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_byte v
-define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @srl_buffer_load_i64_to_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
-  %load = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i8, ptr addrspace(1) %out, i32 %tid
+  %load = load i64, ptr addrspace(1) %gep.in
   %srl = lshr i64 %load, 32
   %trunc = trunc i64 %srl to i8
-  store i8 %trunc, i8 addrspace(1)* %gep.out
+  store i8 %trunc, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i64_to_i8(ptr addrspace(1) %out, [8 x i32], i64 %arg) nounwind {
   %trunc = trunc i64 %arg to i8
-  store i8 %trunc, i8 addrspace(1)* %out
+  store i8 %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_byte v
-define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
-  %load = load i64, i64 addrspace(1)* %gep.in
+  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %gep.out = getelementptr i8, ptr addrspace(1) %out, i32 %tid
+  %load = load i64, ptr addrspace(1) %gep.in
   %trunc = trunc i64 %load to i8
-  store i8 %trunc, i8 addrspace(1)* %gep.out
+  store i8 %trunc, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -194,22 +194,22 @@ define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out,
 ; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff
-define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
+define amdgpu_kernel void @smrd_mask_i32_to_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 entry:
-  %val = load i32, i32 addrspace(4)* %in
+  %val = load i32, ptr addrspace(4) %in
   %mask = and i32 %val, 65535
-  store i32 %mask, i32 addrspace(1)* %out
+  store i32 %mask, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_dword v
-define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
-  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %ld = load <2 x i32>, ptr addrspace(1) %in
   %bc = bitcast <2 x i32> %ld to i64
   %hi = lshr i64 %bc, 32
   %trunc = trunc i64 %hi to i32
-  store i32 %trunc, i32 addrspace(1)* %out
+  store i32 %trunc, ptr addrspace(1) %out
   ret void
 }

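In no-shrink-extloads.ll the %out and %in arguments used to carry different pointee types (e.g. i16 vs. i32); with opaque pointers both become plain ptr addrspace(1), and the element types survive only on the getelementptr, load and store. A minimal sketch of one such truncate test's shape (hypothetical name, not part of the patch):

define amdgpu_kernel void @truncate_shape(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %tid) {
  %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid     ; i32 elements
  %gep.out = getelementptr i16, ptr addrspace(1) %out, i32 %tid   ; i16 elements
  %load = load i32, ptr addrspace(1) %gep.in
  %trunc = trunc i32 %load to i16
  store i16 %trunc, ptr addrspace(1) %gep.out
  ret void
}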
diff --git a/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll b/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll
index 22f15a2506ae9..4b0e5b4cd7b32 100644
--- a/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll
@@ -6,10 +6,10 @@
 ; GCN:  v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
 ; GCN:  flat_load_dword v{{[0-9]+}}, v[[[LO_VREG]]:[[HI_VREG]]]
 
-define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, [8 x i32], i32 addrspace(1)* nocapture %arg1) {
+define amdgpu_kernel void @volatile_load(ptr addrspace(1) %arg, [8 x i32], ptr addrspace(1) nocapture %arg1) {
 bb:
-  %tmp18 = load volatile i32, i32 addrspace(1)* %arg, align 4
-  %tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 5
-  store i32 %tmp18, i32 addrspace(1)* %tmp26, align 4
+  %tmp18 = load volatile i32, ptr addrspace(1) %arg, align 4
+  %tmp26 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 5
+  store i32 %tmp18, ptr addrspace(1) %tmp26, align 4
   ret void
 }

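not-scalarize-volatile-load.ll checks that a volatile load from a uniform global pointer is still emitted as flat_load_dword rather than being scalarized to an s_load; only the IR spelling changes here, the GCN check lines are untouched. A minimal sketch of the IR that should keep producing the VMEM load (illustrative only, not part of the patch):

define amdgpu_kernel void @volatile_load_shape(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) {
  %v = load volatile i32, ptr addrspace(1) %arg, align 4
  %dst = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 5
  store i32 %v, ptr addrspace(1) %dst, align 4
  ret void
}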
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index d28151a2147fc..7e8d9ae929c0b 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -6,7 +6,7 @@
 ; Test splitting flat instruction offsets into the low and high bits
 ; when the offset doesn't fit in the offset field.
 
-define i8 @flat_inst_valu_offset_1(i8* %p) {
+define i8 @flat_inst_valu_offset_1(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -31,12 +31,12 @@ define i8 @flat_inst_valu_offset_1(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 1
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 1
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_11bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -61,12 +61,12 @@ define i8 @flat_inst_valu_offset_11bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 2047
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 2047
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_12bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -91,12 +91,12 @@ define i8 @flat_inst_valu_offset_12bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 4095
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 4095
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_13bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -125,12 +125,12 @@ define i8 @flat_inst_valu_offset_13bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 8191
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 8191
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_neg_11bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -159,12 +159,12 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 -2048
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 -2048
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_neg_12bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -193,12 +193,12 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 -4096
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 -4096
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_neg_13bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -227,12 +227,12 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 -8192
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 -8192
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_11bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -257,12 +257,12 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 4095
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 4095
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_12bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -291,12 +291,12 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 8191
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 8191
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_13bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -325,12 +325,12 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 16383
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 16383
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_neg_11bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -359,12 +359,12 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 -4096
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 -4096
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_neg_12bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -393,12 +393,12 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 -8192
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 -8192
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define i8 @flat_inst_valu_offset_2x_neg_13bit_max(i8* %p) {
+define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -427,13 +427,13 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 -16384
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 -16384
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2047
-define i8 @flat_inst_valu_offset_64bit_11bit_split0(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -462,13 +462,13 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 8589936639
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 8589936639
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2048
-define i8 @flat_inst_valu_offset_64bit_11bit_split1(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -497,13 +497,13 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 8589936640
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 8589936640
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4095
-define i8 @flat_inst_valu_offset_64bit_12bit_split0(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -532,13 +532,13 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 8589938687
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 8589938687
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4096
-define i8 @flat_inst_valu_offset_64bit_12bit_split1(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -567,13 +567,13 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 8589938688
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 8589938688
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8191
-define i8 @flat_inst_valu_offset_64bit_13bit_split0(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -602,13 +602,13 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 8589942783
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 8589942783
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8192
-define i8 @flat_inst_valu_offset_64bit_13bit_split1(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -637,13 +637,13 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 8589942784
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 8589942784
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
-define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -673,13 +673,13 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 -9223372036854773761
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 -9223372036854773761
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
-define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -709,13 +709,13 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 -9223372036854773760
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 -9223372036854773760
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
-define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -745,13 +745,13 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 -9223372036854771713
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 -9223372036854771713
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
-define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -781,13 +781,13 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 -9223372036854771712
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 -9223372036854771712
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
-define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -817,13 +817,13 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 -9223372036854767617
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 -9223372036854767617
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
-define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(i8* %p) {
+define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -853,12 +853,12 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(i8* %p) {
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8* %p, i64 -9223372036854767616
-  %load = load i8, i8* %gep, align 4
+  %gep = getelementptr i8, ptr %p, i64 -9223372036854767616
+  %load = load i8, ptr %gep, align 4
   ret i8 %load
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -893,13 +893,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 1
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 1
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -934,13 +934,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 2047
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 2047
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -975,13 +975,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 4095
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 4095
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1020,13 +1020,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 8191
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 8191
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1065,13 +1065,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 -2048
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 -2048
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1110,13 +1110,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 -4096
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 -4096
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1155,13 +1155,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 -8192
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 -8192
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1196,13 +1196,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 4095
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 4095
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_2x_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1241,13 +1241,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 8191
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 8191
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_2x_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1286,13 +1286,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 16383
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 16383
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1331,13 +1331,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 -4096
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 -4096
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1376,13 +1376,13 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 -8192
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 -8192
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1421,14 +1421,14 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 -16384
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 -16384
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2047
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1466,14 +1466,14 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 8589936639
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 8589936639
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2048
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1511,14 +1511,14 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 8589936640
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 8589936640
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4095
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1556,14 +1556,14 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 8589938687
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 8589938687
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4096
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1602,14 +1602,14 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 8589938688
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 8589938688
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8191
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1648,14 +1648,14 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 8589942783
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 8589942783
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8192
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1694,14 +1694,14 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(i8* %p) {
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 8589942784
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 8589942784
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1742,14 +1742,14 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(i8*
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 -9223372036854773761
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 -9223372036854773761
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1790,14 +1790,14 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(i8*
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 -9223372036854773760
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 -9223372036854773760
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1838,14 +1838,14 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(i8*
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 -9223372036854771713
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 -9223372036854771713
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1886,14 +1886,14 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(i8*
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 -9223372036854771712
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 -9223372036854771712
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1934,14 +1934,14 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(i8*
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 -9223372036854767617
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 -9223372036854767617
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
-define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(i8* %p) {
+define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1982,8 +1982,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(i8*
 ; GFX11-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8* %p, i64 -9223372036854767616
-  %load = load volatile i8, i8* %gep, align 1
-  store i8 %load, i8* undef
+  %gep = getelementptr i8, ptr %p, i64 -9223372036854767616
+  %load = load volatile i8, ptr %gep, align 1
+  store i8 %load, ptr undef
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 62fa890736c10..1ed006b621ae4 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -6,7 +6,7 @@
 ; Test splitting flat instruction offsets into the low and high bits
 ; when the offset doesn't fit in the offset field.
 
-define i8 @global_inst_valu_offset_1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29,12 +29,12 @@ define i8 @global_inst_valu_offset_1(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 1
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_11bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -57,12 +57,12 @@ define i8 @global_inst_valu_offset_11bit_max(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_12bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -87,12 +87,12 @@ define i8 @global_inst_valu_offset_12bit_max(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_13bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -121,12 +121,12 @@ define i8 @global_inst_valu_offset_13bit_max(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -149,12 +149,12 @@ define i8 @global_inst_valu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -179,12 +179,12 @@ define i8 @global_inst_valu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -213,12 +213,12 @@ define i8 @global_inst_valu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -243,12 +243,12 @@ define i8 @global_inst_valu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -277,12 +277,12 @@ define i8 @global_inst_valu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -311,12 +311,12 @@ define i8 @global_inst_valu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 16383
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -341,12 +341,12 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -375,12 +375,12 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define i8 @global_inst_valu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -409,13 +409,13 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2047
-define i8 @global_inst_valu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -444,13 +444,13 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2048
-define i8 @global_inst_valu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -479,13 +479,13 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4095
-define i8 @global_inst_valu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -514,13 +514,13 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4096
-define i8 @global_inst_valu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -549,13 +549,13 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8191
-define i8 @global_inst_valu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -584,13 +584,13 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8192
-define i8 @global_inst_valu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -619,13 +619,13 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
-define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -655,13 +655,13 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)*
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-2049
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
-define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -691,13 +691,13 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)*
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
-define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -727,13 +727,13 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)*
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
-define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -763,13 +763,13 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)*
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
-define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -799,13 +799,13 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)*
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
-define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
+define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -835,12 +835,12 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)*
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616
-  %load = load i8, i8 addrspace(1)* %gep, align 4
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
+  %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -871,13 +871,13 @@ define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) {
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 1
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -908,13 +908,13 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -945,13 +945,13 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -982,13 +982,13 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1019,13 +1019,13 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1057,13 +1057,13 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1099,13 +1099,13 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1)
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1136,13 +1136,13 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)*
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1173,13 +1173,13 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)*
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1210,13 +1210,13 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)*
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 16383
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1248,13 +1248,13 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1290,13 +1290,13 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1332,14 +1332,14 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2047
-define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1375,14 +1375,14 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(i8 addrspa
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
 ; Fill 11-bit low-bits (1ull << 33) | 2048
-define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1418,14 +1418,14 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspa
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4095
-define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1461,14 +1461,14 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspa
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
 ; Fill 12-bit low-bits (1ull << 33) | 4096
-define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1504,14 +1504,14 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(i8 addrspa
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8191
-define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1547,14 +1547,14 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspa
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
 ; Fill 13-bit low-bits (1ull << 33) | 8192
-define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1590,14 +1590,14 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(i8 addrspa
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
-define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1634,14 +1634,14 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
 ; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
-define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1678,14 +1678,14 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
-define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1722,14 +1722,14 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
 ; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
-define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1766,14 +1766,14 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
-define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1810,14 +1810,14 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
 
 ; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
-define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
+define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1854,8 +1854,8 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616
-  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
-  store i8 %load, i8 addrspace(1)* undef
+  %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
+  %load = load volatile i8, ptr addrspace(1) %gep, align 1
+  store i8 %load, ptr addrspace(1) undef
   ret void
 }
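The hunks above all apply the same mechanical rewrite: every typed pointer type in the test IR becomes the opaque ptr type, the address space annotation is kept, and the pointee type is carried only by the getelementptr/load/store operands. A minimal before/after sketch of the pattern, using a hypothetical @example function rather than one of the tests above:

; typed-pointer form (old)
define i8 @example(i8 addrspace(1)* %p) {
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; opaque-pointer form (new); the access type now appears only on the
; getelementptr and load, so the generated code is unchanged (NFC).
define i8 @example(ptr addrspace(1) %p) {
  %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
  %load = load i8, ptr addrspace(1) %gep, align 4
  ret i8 %load
}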

diff  --git a/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll b/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll
index ceae2bfbd1c3f..b05de19ca3405 100644
--- a/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll
@@ -6,9 +6,9 @@
 
 ; EG: CF_END
 ; SI: s_endpgm
-define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel(ptr addrspace(1) %out) {
 entry:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   ret void
 }
 
@@ -16,7 +16,7 @@ attributes #3 = { nounwind }
 
 !opencl.kernels = !{!0}
 
-!0 = !{void (i32 addrspace(1)*)* @kernel, !1, !2, !3, !4, !5}
+!0 = !{ptr @kernel, !1, !2, !3, !4, !5}
 !1 = !{!"kernel_arg_addr_space", i32 0}
 !2 = !{!"kernel_arg_access_qual", !"none"}
 !3 = !{!"kernel_arg_type", !"int*"}

diff  --git a/llvm/test/CodeGen/AMDGPU/opencl-printf-and-hostcall.ll b/llvm/test/CodeGen/AMDGPU/opencl-printf-and-hostcall.ll
index 8bfbfe23a083e..d8cef318f27b9 100644
--- a/llvm/test/CodeGen/AMDGPU/opencl-printf-and-hostcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/opencl-printf-and-hostcall.ll
@@ -5,15 +5,14 @@
 define amdgpu_kernel void @test_kernel(i32 %n) {
 entry:
   %str = alloca [9 x i8], align 1, addrspace(5)
-  %arraydecay = getelementptr inbounds [9 x i8], [9 x i8] addrspace(5)* %str, i32 0, i32 0
-  %call1 = call i32 (i8 addrspace(4)*, ...) @printf(i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @.str, i32 0, i32 0), i8 addrspace(5)* %arraydecay, i32 %n)
-  %call2 = call <2 x i64> (i8*, i32, i64, i64, i64, i64, i64, i64, i64, i64) @__ockl_hostcall_internal(i8* undef, i32 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9)
+  %call1 = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) @.str, ptr addrspace(5) %str, i32 %n)
+  %call2 = call <2 x i64> (ptr, i32, i64, i64, i64, i64, i64, i64, i64, i64) @__ockl_hostcall_internal(ptr undef, i32 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9)
   ret void
 }
 
-declare i32 @printf(i8 addrspace(4)*, ...)
+declare i32 @printf(ptr addrspace(4), ...)
 
-declare <2 x i64> @__ockl_hostcall_internal(i8*, i32, i64, i64, i64, i64, i64, i64, i64, i64)
+declare <2 x i64> @__ockl_hostcall_internal(ptr, i32, i64, i64, i64, i64, i64, i64, i64, i64)
 
 ; CHECK-NOT: error:
 ; CHECK-NOT: warning:
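Besides the type rewrite, the printf/hostcall test above also drops the %arraydecay getelementptr: with typed pointers that zero-index GEP existed only to turn [9 x i8] addrspace(5)* into i8 addrspace(5)*, and with opaque pointers it is a zero-offset no-op, so the alloca result is passed to printf directly. The same folding replaces the constant GEP of @.str with a plain reference to @.str. An illustrative sketch of the fold (not taken verbatim from the file):

; typed pointers: the decay GEP changes only the pointer type
%arraydecay = getelementptr inbounds [9 x i8], [9 x i8] addrspace(5)* %str, i32 0, i32 0
; opaque pointers: the equivalent GEP would return the same value as %str,
;   getelementptr inbounds [9 x i8], ptr addrspace(5) %str, i32 0, i32 0
; so %str is used directly and the instruction is dropped.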

diff  --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
index 38a038c7b81d8..a399b509014dd 100644
--- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
@@ -3,7 +3,7 @@
 
 ; CHECK-LABEL: {{^}}fold_sgpr:
 ; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s
-define amdgpu_kernel void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) #1 {
+define amdgpu_kernel void @fold_sgpr(ptr addrspace(1) %out, i32 %fold) #1 {
 entry:
   %tmp0 = icmp ne i32 %fold, 0
   br i1 %tmp0, label %if, label %endif
@@ -11,8 +11,8 @@ entry:
 if:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %offset = add i32 %fold, %id
-  %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset
-  store i32 0, i32 addrspace(1)* %tmp1
+  %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %offset
+  store i32 0, ptr addrspace(1) %tmp1
   br label %endif
 
 endif:
@@ -21,7 +21,7 @@ endif:
 
 ; CHECK-LABEL: {{^}}fold_imm:
 ; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5
-define amdgpu_kernel void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) #1 {
+define amdgpu_kernel void @fold_imm(ptr addrspace(1) %out, i32 %cmp) #1 {
 entry:
   %fold = add i32 3, 2
   %tmp0 = icmp ne i32 %cmp, 0
@@ -30,7 +30,7 @@ entry:
 if:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %val = or i32 %id, %fold
-  store i32 %val, i32 addrspace(1)* %out
+  store i32 %val, ptr addrspace(1) %out
   br label %endif
 
 endif:
@@ -47,10 +47,10 @@ endif:
 ; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
 ; CHECK: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]],
 
-define amdgpu_kernel void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) #1 {
+define amdgpu_kernel void @fold_64bit_constant_add(ptr addrspace(1) %out, i32 %cmp, i64 %val) #1 {
 entry:
   %tmp0 = add i64 %val, 1
-  store i64 %tmp0, i64 addrspace(1)* %out
+  store i64 %tmp0, ptr addrspace(1) %out
   ret void
 }
 
@@ -62,7 +62,7 @@ entry:
 ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
 ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
 
-define amdgpu_kernel void @vector_inline(<4 x i32> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @vector_inline(ptr addrspace(1) %out) #1 {
 entry:
   %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp0, 1
@@ -73,7 +73,7 @@ entry:
   %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2
   %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3
   %tmp4 = xor <4 x i32> <i32 5, i32 5, i32 5, i32 5>, %vec3
-  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %tmp4, ptr addrspace(1) %out
   ret void
 }
 
@@ -81,11 +81,11 @@ entry:
 ; CHECK-LABEL: {{^}}imm_one_use:
 ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}}
 
-define amdgpu_kernel void @imm_one_use(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @imm_one_use(ptr addrspace(1) %out) #1 {
 entry:
   %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = xor i32 %tmp0, 100
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 ; CHECK-LABEL: {{^}}vector_imm:
@@ -94,7 +94,7 @@ entry:
 ; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}}
 ; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}}
 
-define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @vector_imm(ptr addrspace(1) %out) #1 {
 entry:
   %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp0, 1
@@ -105,7 +105,7 @@ entry:
   %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2
   %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3
   %tmp4 = xor <4 x i32> <i32 100, i32 100, i32 100, i32 100>, %vec3
-  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %tmp4, ptr addrspace(1) %out
   ret void
 }
 
@@ -115,12 +115,12 @@ entry:
 ; CHECK: v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]]
 ; CHECK: buffer_store_dword v[[LO]]
 define amdgpu_kernel void @no_fold_tied_subregister() #1 {
-  %tmp1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+  %tmp1 = load volatile <2 x float>, ptr addrspace(1) undef
   %tmp2 = extractelement <2 x float> %tmp1, i32 0
   %tmp3 = extractelement <2 x float> %tmp1, i32 1
   %tmp4 = fmul float %tmp3, 10.0
   %tmp5 = fadd float %tmp4, %tmp2
-  store volatile float %tmp5, float addrspace(1)* undef
+  store volatile float %tmp5, ptr addrspace(1) undef
   ret void
 }
 
@@ -130,12 +130,12 @@ define amdgpu_kernel void @no_fold_tied_subregister() #1 {
 ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define void @no_extra_fold_on_same_opnd() #1 {
 entry:
-  %s0 = load i32, i32 addrspace(5)* undef, align 4
+  %s0 = load i32, ptr addrspace(5) undef, align 4
   %s0.i64= zext i32 %s0 to i64
   br label %for.body.i.i
 
 for.body.i.i:
-  %s1 = load i32, i32 addrspace(1)* undef, align 8
+  %s1 = load i32, ptr addrspace(1) undef, align 8
   %s1.i64 = sext i32 %s1 to i64
   %xor = xor i64 %s1.i64, %s0.i64
   %flag = icmp ult i64 %xor, 8

diff  --git a/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll b/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
index 4e37fd8500e4a..d1469ed6c6743 100644
--- a/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
@@ -93,10 +93,10 @@ define amdgpu_kernel void @test_long_add4(<4 x i64> %arg) #0 {
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_endpgm
 entry:
-  %load0 = load <4 x i64>, <4 x i64> addrspace(1)* null, align 32
-  %load1 = load <4 x i64>, <4 x i64> addrspace(1)* null, align 32
+  %load0 = load <4 x i64>, ptr addrspace(1) null, align 32
+  %load1 = load <4 x i64>, ptr addrspace(1) null, align 32
   %add = add <4 x i64> %load0, %load1
-  store <4 x i64> %add, <4 x i64> addrspace(1)* null, align 32
+  store <4 x i64> %add, ptr addrspace(1) null, align 32
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index e387d88285656..d8d327259dec6 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s
 
 
-define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) #0 {
+define amdgpu_kernel void @s_pack_v2f16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 {
 ; GFX9-LABEL: s_pack_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -47,8 +47,8 @@ define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(4)* %in0, i32 addrspace(4)
 ; GFX7-NEXT:    ; use s0
 ; GFX7-NEXT:    ;;#ASMEND
 ; GFX7-NEXT:    s_endpgm
-  %val0 = load volatile i32, i32 addrspace(4)* %in0
-  %val1 = load volatile i32, i32 addrspace(4)* %in1
+  %val0 = load volatile i32, ptr addrspace(4) %in0
+  %val1 = load volatile i32, ptr addrspace(4) %in1
   %lo.i = trunc i32 %val0 to i16
   %hi.i = trunc i32 %val1 to i16
   %lo = bitcast i16 %lo.i to half
@@ -61,7 +61,7 @@ define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(4)* %in0, i32 addrspace(4)
   ret void
 }
 
-define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(4)* %in1) #0 {
+define amdgpu_kernel void @s_pack_v2f16_imm_lo(ptr addrspace(4) %in1) #0 {
 ; GFX9-LABEL: s_pack_v2f16_imm_lo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -99,7 +99,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(4)* %in1) #0 {
 ; GFX7-NEXT:    ; use s0
 ; GFX7-NEXT:    ;;#ASMEND
 ; GFX7-NEXT:    s_endpgm
-  %val1 = load i32, i32 addrspace(4)* %in1
+  %val1 = load i32, ptr addrspace(4) %in1
   %hi.i = trunc i32 %val1 to i16
   %hi = bitcast i16 %hi.i to half
   %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
@@ -110,7 +110,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(4)* %in1) #0 {
   ret void
 }
 
-define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(4)* %in0) #0 {
+define amdgpu_kernel void @s_pack_v2f16_imm_hi(ptr addrspace(4) %in0) #0 {
 ; GFX9-LABEL: s_pack_v2f16_imm_hi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -148,7 +148,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(4)* %in0) #0 {
 ; GFX7-NEXT:    ; use s0
 ; GFX7-NEXT:    ;;#ASMEND
 ; GFX7-NEXT:    s_endpgm
-  %val0 = load i32, i32 addrspace(4)* %in0
+  %val0 = load i32, ptr addrspace(4) %in0
   %lo.i = trunc i32 %val0 to i16
   %lo = bitcast i16 %lo.i to half
   %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
@@ -159,7 +159,7 @@ define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(4)* %in0) #0 {
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GFX9-LABEL: v_pack_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -222,10 +222,10 @@ define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
-  %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
-  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
-  %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
+  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
+  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
   %lo.i = trunc i32 %val0 to i16
   %hi.i = trunc i32 %val1 to i16
   %lo = bitcast i16 %lo.i to half
@@ -237,7 +237,7 @@ define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GFX9-LABEL: v_pack_v2f16_user:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -305,10 +305,10 @@ define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspa
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
-  %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
-  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
-  %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
+  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
+  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
   %lo.i = trunc i32 %val0 to i16
   %hi.i = trunc i32 %val1 to i16
   %lo = bitcast i16 %lo.i to half
@@ -317,11 +317,11 @@ define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspa
   %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
   %vec.i32 = bitcast <2 x half> %vec.1 to i32
   %foo = add i32 %vec.i32, 9
-  store volatile i32 %foo, i32 addrspace(1)* undef
+  store volatile i32 %foo, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 {
 ; GFX9-LABEL: v_pack_v2f16_imm_lo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -372,8 +372,8 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 {
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
-  %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
+  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
   %hi.i = trunc i32 %val1 to i16
   %hi = bitcast i16 %hi.i to half
   %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0
@@ -383,7 +383,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 {
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 {
 ; GFX9-LABEL: v_pack_v2f16_inline_imm_lo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -434,8 +434,8 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
-  %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
+  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
   %hi.i = trunc i32 %val1 to i16
   %hi = bitcast i16 %hi.i to half
   %vec.0 = insertelement <2 x half> undef, half 4.0, i32 0
@@ -445,7 +445,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 {
+define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX9-LABEL: v_pack_v2f16_imm_hi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -496,8 +496,8 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 {
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
-  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
+  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
   %lo.i = trunc i32 %val0 to i16
   %lo = bitcast i16 %lo.i to half
   %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
@@ -507,7 +507,7 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 {
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 {
+define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX9-LABEL: v_pack_v2f16_inline_f16imm_hi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -558,8 +558,8 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0)
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
-  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
+  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
   %lo.i = trunc i32 %val0 to i16
   %lo = bitcast i16 %lo.i to half
   %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
@@ -569,7 +569,7 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0)
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
+define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX9-LABEL: v_pack_v2f16_inline_imm_hi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -619,8 +619,8 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
-  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
+  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
   %lo.i = trunc i32 %val0 to i16
   %lo = bitcast i16 %lo.i to half
   %vec.0 = insertelement <2 x half> undef, half %lo, i32 0

diff  --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index 77f614cdf2eb0..754a925fd38f0 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
 
 
-define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) #0 {
+define amdgpu_kernel void @s_pack_v2i16(ptr addrspace(4) %in0, ptr addrspace(4) %in1) #0 {
 ; GFX9-LABEL: s_pack_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -47,8 +47,8 @@ define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(4)* %in0, i32 addrspace(4)
 ; GFX7-NEXT:    ; use s0
 ; GFX7-NEXT:    ;;#ASMEND
 ; GFX7-NEXT:    s_endpgm
-  %val0 = load volatile i32, i32 addrspace(4)* %in0
-  %val1 = load volatile i32, i32 addrspace(4)* %in1
+  %val0 = load volatile i32, ptr addrspace(4) %in0
+  %val1 = load volatile i32, ptr addrspace(4) %in1
   %lo = trunc i32 %val0 to i16
   %hi = trunc i32 %val1 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
@@ -59,7 +59,7 @@ define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(4)* %in0, i32 addrspace(4)
   ret void
 }
 
-define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(4)* %in1) #0 {
+define amdgpu_kernel void @s_pack_v2i16_imm_lo(ptr addrspace(4) %in1) #0 {
 ; GFX9-LABEL: s_pack_v2i16_imm_lo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -97,7 +97,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(4)* %in1) #0 {
 ; GFX7-NEXT:    ; use s0
 ; GFX7-NEXT:    ;;#ASMEND
 ; GFX7-NEXT:    s_endpgm
-  %val1 = load i32, i32 addrspace(4)* %in1
+  %val1 = load i32, ptr addrspace(4) %in1
   %hi = trunc i32 %val1 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 456, i32 0
   %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
@@ -107,7 +107,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(4)* %in1) #0 {
   ret void
 }
 
-define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(4)* %in0) #0 {
+define amdgpu_kernel void @s_pack_v2i16_imm_hi(ptr addrspace(4) %in0) #0 {
 ; GFX9-LABEL: s_pack_v2i16_imm_hi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -145,7 +145,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(4)* %in0) #0 {
 ; GFX7-NEXT:    ; use s0
 ; GFX7-NEXT:    ;;#ASMEND
 ; GFX7-NEXT:    s_endpgm
-  %val0 = load i32, i32 addrspace(4)* %in0
+  %val0 = load i32, ptr addrspace(4) %in0
   %lo = trunc i32 %val0 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
   %vec.1 = insertelement <2 x i16> %vec.0, i16 456, i32 1
@@ -155,7 +155,7 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(4)* %in0) #0 {
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GFX9-LABEL: v_pack_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -218,10 +218,10 @@ define amdgpu_kernel void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
-  %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
-  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
-  %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
+  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
+  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
   %lo = trunc i32 %val0 to i16
   %hi = trunc i32 %val1 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
@@ -231,7 +231,7 @@ define amdgpu_kernel void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GFX9-LABEL: v_pack_v2i16_user:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -299,21 +299,21 @@ define amdgpu_kernel void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspa
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
-  %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
-  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
-  %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
+  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
+  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
   %lo = trunc i32 %val0 to i16
   %hi = trunc i32 %val1 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
   %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
   %vec.i32 = bitcast <2 x i16> %vec.1 to i32
   %foo = add i32 %vec.i32, 9
-  store volatile i32 %foo, i32 addrspace(1)* undef
+  store volatile i32 %foo, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 {
 ; GFX9-LABEL: v_pack_v2i16_imm_lo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -364,8 +364,8 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 {
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
-  %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
+  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
   %hi = trunc i32 %val1 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 123, i32 0
   %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
@@ -374,7 +374,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 {
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 {
 ; GFX9-LABEL: v_pack_v2i16_inline_imm_lo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -424,8 +424,8 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
-  %val1 = load volatile i32, i32 addrspace(1)* %in1.gep
+  %in1.gep = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %tid.ext
+  %val1 = load volatile i32, ptr addrspace(1) %in1.gep
   %hi = trunc i32 %val1 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 64, i32 0
   %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
@@ -434,7 +434,7 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 {
+define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX9-LABEL: v_pack_v2i16_imm_hi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -485,8 +485,8 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 {
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
-  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
+  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
   %lo = trunc i32 %val0 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
   %vec.1 = insertelement <2 x i16> %vec.0, i16 123, i32 1
@@ -495,7 +495,7 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 {
   ret void
 }
 
-define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
+define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX9-LABEL: v_pack_v2i16_inline_imm_hi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -545,8 +545,8 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0
 ; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
-  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+  %in0.gep = getelementptr inbounds i32, ptr addrspace(1) %in0, i64 %tid.ext
+  %val0 = load volatile i32, ptr addrspace(1) %in0.gep
   %lo = trunc i32 %val0 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
   %vec.1 = insertelement <2 x i16> %vec.0, i16 7, i32 1
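For context on what the pack.v2i16 tests above exercise: each one truncates two i32 values to i16, builds a <2 x i16> vector, and bitcasts it to i32, which on a little-endian target such as amdgcn places element 0 in the low 16 bits and element 1 in the high 16 bits. A standalone sketch of that pattern, using a hypothetical @pack_lo_hi helper rather than one of the kernels above:

define i32 @pack_lo_hi(i16 %lo, i16 %hi) {
  ; element 0 ends up in bits 0-15, element 1 in bits 16-31 of %packed
  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
  %packed = bitcast <2 x i16> %vec.1 to i32
  ret i32 %packed
}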

diff  --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 43f044f89db51..2af6adb4d16c0 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -4,48 +4,48 @@
 ; GCN-LABEL: {{^}}fadd_v2_vv:
 ; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
-define amdgpu_kernel void @fadd_v2_vv(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %add = fadd <2 x float> %load, %load
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fadd_v2_vs:
 ; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fadd_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) {
+define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %add = fadd <2 x float> %load, %x
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fadd_v4_vs:
 ; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fadd_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) {
+define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id
-  %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16
+  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <4 x float>, ptr addrspace(1) %gep, align 16
   %add = fadd <4 x float> %load, %x
-  store <4 x float> %add, <4 x float> addrspace(1)* %gep, align 16
+  store <4 x float> %add, ptr addrspace(1) %gep, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}fadd_v32_vs:
 ; GFX900-COUNT-32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A-COUNT-16: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fadd_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) {
+define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id
-  %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128
+  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <32 x float>, ptr addrspace(1) %gep, align 128
   %add = fadd <32 x float> %load, %x
-  store <32 x float> %add, <32 x float> addrspace(1)* %gep, align 128
+  store <32 x float> %add, ptr addrspace(1) %gep, align 128
   ret void
 }
 
@@ -53,39 +53,39 @@ define amdgpu_kernel void @fadd_v32_vs(<32 x float> addrspace(1)* %a, <32 x floa
 ; GFX90A:         s_mov_b32 s[[K:[0-9]+]], 0x42c80000
 ; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
 ; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
-define amdgpu_kernel void @fadd_v2_v_imm(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %add = fadd <2 x float> %load, <float 100.0, float 100.0>
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fadd_v2_v_v_splat:
 ; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
 ; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
-define amdgpu_kernel void @fadd_v2_v_v_splat(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fid = bitcast i32 %id to float
   %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
   %k = insertelement <2 x float> %tmp1, float %fid, i64 1
   %add = fadd <2 x float> %load, %k
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fadd_v2_v_lit_splat:
 ; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
 ; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
-define amdgpu_kernel void @fadd_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %add = fadd <2 x float> %load, <float 1.0, float 1.0>
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
@@ -94,12 +94,12 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
 ; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
 ; GFX90A-DAG: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x3f800000
 ; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], [[K]]
-define amdgpu_kernel void @fadd_v2_v_lit_hi0(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %add = fadd <2 x float> %load, <float 1.0, float 0.0>
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
@@ -109,12 +109,12 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(<2 x float> addrspace(1)* %a) {
 ; GFX90A-DAG: s_mov_b32 s[[LO:[0-9]+]], 0
 ; GFX90A-DAG: s_mov_b32 s[[HI:[0-9]+]], 1.0
 ; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[LO]]:[[HI]]]{{$}}
-define amdgpu_kernel void @fadd_v2_v_lit_lo0(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %add = fadd <2 x float> %load, <float 0.0, float 1.0>
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
@@ -124,27 +124,27 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(<2 x float> addrspace(1)* %a) {
 ; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 1.0
 ; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 2.0
 ; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %add = fadd <2 x float> %load, <float 1.0, float 2.0>
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fadd_v2_v_fneg:
 ; GFX900-COUNT-2: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
-define amdgpu_kernel void @fadd_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) {
+define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fneg = fsub float -0.0, %x
   %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
   %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
   %add = fadd <2 x float> %load, %k
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
@@ -152,15 +152,15 @@ define amdgpu_kernel void @fadd_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x
 ; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1]{{$}}
-define amdgpu_kernel void @fadd_v2_v_fneg_lo(<2 x float> addrspace(1)* %a, float %x) {
+define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fneg = fsub float -0.0, %x
   %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
   %k = insertelement <2 x float> %tmp1, float %x, i64 1
   %add = fadd <2 x float> %load, %k
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
@@ -168,15 +168,15 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(<2 x float> addrspace(1)* %a, float
 ; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
-define amdgpu_kernel void @fadd_v2_v_fneg_hi(<2 x float> addrspace(1)* %a, float %x) {
+define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fneg = fsub float -0.0, %x
   %tmp1 = insertelement <2 x float> undef, float %x, i64 0
   %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
   %add = fadd <2 x float> %load, %k
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
@@ -184,15 +184,15 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(<2 x float> addrspace(1)* %a, float
 ; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] neg_lo:[0,1]{{$}}
-define amdgpu_kernel void @fadd_v2_v_fneg_lo2(<2 x float> addrspace(1)* %a, float %x, float %y) {
+define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, float %y) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fneg = fsub float -0.0, %x
   %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
   %k = insertelement <2 x float> %tmp1, float %y, i64 1
   %add = fadd <2 x float> %load, %k
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
@@ -200,63 +200,63 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(<2 x float> addrspace(1)* %a, floa
 ; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
-define amdgpu_kernel void @fadd_v2_v_fneg_hi2(<2 x float> addrspace(1)* %a, float %x, float %y) {
+define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, float %y) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fneg = fsub float -0.0, %x
   %tmp1 = insertelement <2 x float> undef, float %y, i64 0
   %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
   %add = fadd <2 x float> %load, %k
-  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %add, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fmul_v2_vv:
 ; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
-define amdgpu_kernel void @fmul_v2_vv(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %mul = fmul <2 x float> %load, %load
-  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fmul_v2_vs:
 ; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fmul_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) {
+define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %mul = fmul <2 x float> %load, %x
-  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fmul_v4_vs:
 ; GFX900-COUNT-4: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A-COUNT-2: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fmul_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) {
+define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id
-  %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16
+  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <4 x float>, ptr addrspace(1) %gep, align 16
   %mul = fmul <4 x float> %load, %x
-  store <4 x float> %mul, <4 x float> addrspace(1)* %gep, align 16
+  store <4 x float> %mul, ptr addrspace(1) %gep, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}fmul_v32_vs:
 ; GFX900-COUNT-32: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A-COUNT-16: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fmul_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) {
+define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id
-  %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128
+  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <32 x float>, ptr addrspace(1) %gep, align 128
   %mul = fmul <32 x float> %load, %x
-  store <32 x float> %mul, <32 x float> addrspace(1)* %gep, align 128
+  store <32 x float> %mul, ptr addrspace(1) %gep, align 128
   ret void
 }
 
@@ -264,39 +264,39 @@ define amdgpu_kernel void @fmul_v32_vs(<32 x float> addrspace(1)* %a, <32 x floa
 ; GFX90A:         s_mov_b32 s[[K:[0-9]+]], 0x42c80000
 ; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
 ; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
-define amdgpu_kernel void @fmul_v2_v_imm(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %mul = fmul <2 x float> %load, <float 100.0, float 100.0>
-  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fmul_v2_v_v_splat:
 ; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
 ; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
-define amdgpu_kernel void @fmul_v2_v_v_splat(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fid = bitcast i32 %id to float
   %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
   %k = insertelement <2 x float> %tmp1, float %fid, i64 1
   %mul = fmul <2 x float> %load, %k
-  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fmul_v2_v_lit_splat:
 ; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
 ; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
-define amdgpu_kernel void @fmul_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %mul = fmul <2 x float> %load, <float 4.0, float 4.0>
-  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
   ret void
 }
 
@@ -306,75 +306,75 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
 ; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
 ; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000
 ; GFX90A:     v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %mul = fmul <2 x float> %load, <float 4.0, float 3.0>
-  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fmul_v2_v_fneg:
 ; GFX900-COUNT-2: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}
 ; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
-define amdgpu_kernel void @fmul_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) {
+define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fneg = fsub float -0.0, %x
   %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
   %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
   %mul = fmul <2 x float> %load, %k
-  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fma_v2_vv:
 ; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
-define amdgpu_kernel void @fma_v2_vv(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %load, <2 x float> %load)
-  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fma_v2_vs:
 ; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fma_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) {
+define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %x, <2 x float> %x)
-  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fma_v4_vs:
 ; GFX900-COUNT-4: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX90A-COUNT-2: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fma_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) {
+define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id
-  %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16
+  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <4 x float>, ptr addrspace(1) %gep, align 16
   %fma = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %load, <4 x float> %x, <4 x float> %x)
-  store <4 x float> %fma, <4 x float> addrspace(1)* %gep, align 16
+  store <4 x float> %fma, ptr addrspace(1) %gep, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}fma_v32_vs:
 ; GFX900-COUNT-32: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX90A-COUNT-16: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fma_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) {
+define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id
-  %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128
+  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <32 x float>, ptr addrspace(1) %gep, align 128
   %fma = tail call <32 x float> @llvm.fma.v32f32(<32 x float> %load, <32 x float> %x, <32 x float> %x)
-  store <32 x float> %fma, <32 x float> addrspace(1)* %gep, align 128
+  store <32 x float> %fma, ptr addrspace(1) %gep, align 128
   ret void
 }
 
@@ -383,39 +383,39 @@ define amdgpu_kernel void @fma_v32_vs(<32 x float> addrspace(1)* %a, <32 x float
 ; GCN-DAG:        v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000
 ; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[[K1]], v[[K2]]
 ; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[[[K2]]:{{[0-9:]+}}] op_sel_hi:[1,0,0]{{$}}
-define amdgpu_kernel void @fma_v2_v_imm(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 100.0, float 100.0>, <2 x float> <float 200.0, float 200.0>)
-  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fma_v2_v_v_splat:
 ; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v0, v0
 ; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1] op_sel_hi:[1,0,0]{{$}}
-define amdgpu_kernel void @fma_v2_v_v_splat(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fid = bitcast i32 %id to float
   %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
   %k = insertelement <2 x float> %tmp1, float %fid, i64 1
   %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
-  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fma_v2_v_lit_splat:
 ; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
 ; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
-define amdgpu_kernel void @fma_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 4.0>, <2 x float> <float 1.0, float 1.0>)
-  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
   ret void
 }
 
@@ -427,97 +427,97 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
 ; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
 ; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
 ; GFX90A:     v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
-define amdgpu_kernel void @fma_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 3.0>, <2 x float> <float 1.0, float 2.0>)
-  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fma_v2_v_fneg:
 ; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}}
 ; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]{{$}}
-define amdgpu_kernel void @fma_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) {
+define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fneg = fsub float -0.0, %x
   %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
   %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
   %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
-  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
 ; GFX900-COUNT-2: v_sub_f32_e32
 ; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
-define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds, float addrspace(3)* %arg2) {
+define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) {
 bb:
-  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 4
-  %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4
+  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4
+  %scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4
   %neg.scalar0 = fsub float -0.0, %scalar0
 
   %neg.scalar0.vec = insertelement <2 x float> undef, float %neg.scalar0, i32 0
   %neg.scalar0.broadcast = shufflevector <2 x float> %neg.scalar0.vec, <2 x float> undef, <2 x i32> zeroinitializer
 
   %result = fadd <2 x float> %vec0, %neg.scalar0.broadcast
-  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 4
+  store <2 x float> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
 ; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] neg_lo:[0,0,1] neg_hi:[0,0,1]
-define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds, float addrspace(3)* %arg2) {
+define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1
-  %arg2.gep = getelementptr inbounds float, float addrspace(3)* %arg2, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
+  %arg2.gep = getelementptr inbounds float, ptr addrspace(3) %arg2, i32 2
 
-  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 4
+  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x float>, ptr addrspace(3) %lds.gep1, align 4
 
-  %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4
-  %scalar1 = load volatile float, float addrspace(3)* %arg2.gep, align 4
+  %scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4
+  %scalar1 = load volatile float, ptr addrspace(3) %arg2.gep, align 4
 
   %vec.ins0 = insertelement <2 x float> undef, float %scalar0, i32 0
   %vec2 = insertelement <2 x float> %vec.ins0, float %scalar1, i32 1
   %neg.vec2 = fsub <2 x float> <float -0.0, float -0.0>, %vec2
 
   %result = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %vec0, <2 x float> %vec1, <2 x float> %neg.vec2)
-  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 4
+  store <2 x float> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}shuffle_add_f32:
 ; GFX900-COUNT-2: v_add_f32_e32
 ; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0]{{$}}
-define amdgpu_kernel void @shuffle_add_f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 8
-  %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1
-  %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 8
+  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8
+  %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
+  %vec1 = load volatile <2 x float>, ptr addrspace(3) %lds.gep1, align 8
   %vec1.swap = shufflevector <2 x float> %vec1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
   %result = fadd <2 x float> %vec0, %vec1.swap
-  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
+  store <2 x float> %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}shuffle_neg_add_f32:
 ; GFX900-COUNT-2: v_sub_f32_e32
 ; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
-define amdgpu_kernel void @shuffle_neg_add_f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 8
-  %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1
-  %f32 = load volatile float, float addrspace(3)* undef, align 8
-  %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 8
+  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8
+  %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
+  %f32 = load volatile float, ptr addrspace(3) undef, align 8
+  %vec1 = load volatile <2 x float>, ptr addrspace(3) %lds.gep1, align 8
   %vec1.neg = fsub <2 x float> <float -0.0, float -0.0>, %vec1
   %vec1.neg.swap = shufflevector <2 x float> %vec1.neg, <2 x float> undef, <2 x i32> <i32 1, i32 0>
   %result = fadd <2 x float> %vec0, %vec1.neg.swap
-  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
+  store <2 x float> %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -533,41 +533,41 @@ bb:
   %i13 = fadd <2 x float> zeroinitializer, %shift8
   %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
   %i15 = fsub <2 x float> %i14, zeroinitializer
-  store <2 x float> %i15, <2 x float>* undef
+  store <2 x float> %i15, ptr undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}fadd_shuffle_v4:
 ; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX90A-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]
-define amdgpu_kernel void @fadd_shuffle_v4(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %gep
+  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
+  %in.1 = load <4 x float>, ptr addrspace(1) %gep
   %shuf = shufflevector <4 x float> %in.1, <4 x float> undef, <4 x i32> zeroinitializer
   %add.1 = fadd <4 x float> %in.1, %shuf
-  store <4 x float> %add.1, <4 x float> addrspace(1)* %gep
+  store <4 x float> %add.1, ptr addrspace(1) %gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}fneg_v2f32_vec:
 ; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
 ; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}}
-define amdgpu_kernel void @fneg_v2f32_vec(<2 x float> addrspace(1)* %a) {
+define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
-  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
+  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
   %fneg = fsub <2 x float> <float -0.0, float -0.0>, %load
-  store <2 x float> %fneg, <2 x float> addrspace(1)* %gep, align 8
+  store <2 x float> %fneg, ptr addrspace(1) %gep, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}fneg_v2f32_scalar:
 ; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-define amdgpu_kernel void @fneg_v2f32_scalar(<2 x float> addrspace(1)* %a, <2 x float> %x) {
+define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x) {
   %fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
-  store <2 x float> %fneg, <2 x float> addrspace(1)* %a, align 8
+  store <2 x float> %fneg, ptr addrspace(1) %a, align 8
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
index 3b256f10b8460..1ba75869ec62c 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -11,19 +11,19 @@
 ; GCN-NOT: or
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
-define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %scalar0 = load volatile half, ptr addrspace(3) %arg2, align 2
 
   %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
   %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
 
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -39,20 +39,20 @@ bb:
 ; GCN-NOT: or
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %scalar0 = load volatile half, ptr addrspace(3) %arg2, align 2
 
   %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
   %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
   %neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %scalar0.broadcast
 
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -68,20 +68,20 @@ bb:
 ; GCN-NOT: or
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %scalar0 = load volatile half, ptr addrspace(3) %arg2, align 2
 
   %neg.scalar0 = fsub half -0.0, %scalar0
   %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
   %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
 
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -97,13 +97,13 @@ bb:
 ; GCN-NOT: or
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
-define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %scalar0 = load volatile half, ptr addrspace(3) %arg2, align 2
 
   %neg.scalar0 = fsub half -0.0, %scalar0
   %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
@@ -111,7 +111,7 @@ bb:
   %neg.neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %neg.scalar0.broadcast
 
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -127,19 +127,19 @@ bb:
 ; GCN-NOT: or
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %scalar0 = load volatile half, ptr addrspace(3) %arg2, align 2
 
   %neg.scalar0 = fsub half -0.0, %scalar0
   %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
   %neg.scalar0.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %scalar0, i32 1
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -155,19 +155,19 @@ bb:
 ; GCN-NOT: or
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_hi:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %scalar0 = load volatile half, ptr addrspace(3) %arg2, align 2
 
   %neg.scalar0 = fsub half -0.0, %scalar0
   %neg.scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
   %scalar0.neg.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %neg.scalar0, i32 1
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -182,10 +182,10 @@ bb:
 ; GCN-NOT: or
 
 ; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
-define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 bb:
-  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
-  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+  %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
+  %scalar0 = load volatile half, ptr addrspace(3) %arg2, align 2
   %neg.scalar0 = fsub half -0.0, %scalar0
   %neg.scalar0.bc = bitcast half %neg.scalar0 to i16
 
@@ -193,7 +193,7 @@ bb:
   %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer
 
   %result = add <2 x i16> %vec0, %neg.scalar0.broadcast
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -209,22 +209,22 @@ bb:
 ; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}}
-define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %arg2.gep = getelementptr inbounds half, ptr addrspace(3) %arg2, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
 
-  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
-  %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
+  %scalar0 = load volatile half, ptr addrspace(3) %arg2, align 2
+  %scalar1 = load volatile half, ptr addrspace(3) %arg2.gep, align 2
 
   %neg.scalar1 = fsub half -0.0, %scalar1
   %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
   %vec2 = insertelement <2 x half> %vec.ins0, half %neg.scalar1, i32 1
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -237,23 +237,23 @@ bb:
 ; GCN: ds_read_u16_d16_hi [[PACKED]]
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %arg2.gep = getelementptr inbounds half, ptr addrspace(3) %arg2, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
 
-  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
-  %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
+  %scalar0 = load volatile half, ptr addrspace(3) %arg2, align 2
+  %scalar1 = load volatile half, ptr addrspace(3) %arg2.gep, align 2
 
   %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
   %vec2 = insertelement <2 x half> %vec.ins0, half %scalar1, i32 1
   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
 
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -268,20 +268,20 @@ bb:
 ; GCN-NOT: or
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, ptr addrspace(3) %lds.gep2, align 4
 
   %vec2.fneg = fsub <2 x half> <half -0.0, half -0.0>, %vec2
   %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -296,21 +296,21 @@ bb:
 ; GCN-NOT: or
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, ptr addrspace(3) %lds.gep2, align 4
 
   %vec2.elt1 = extractelement <2 x half> %vec2, i32 1
   %neg.vec2.elt1 = fsub half -0.0, %vec2.elt1
 
   %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -324,17 +324,17 @@ bb:
 ; GCN-NOT: or
 
 ; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}}
-define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @add_vector_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1
+  %lds.gep1 = getelementptr inbounds <2 x i16>, ptr addrspace(3) %lds, i32 1
 
-  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4
+  %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x i16>, ptr addrspace(3) %lds.gep1, align 4
 
   %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
   %result = add <2 x i16> %vec0, %vec1.elt1.broadcast
 
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -349,20 +349,20 @@ bb:
 ; GCN-NOT: or
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @fma_vector_vector_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, ptr addrspace(3) %lds.gep2, align 4
 
   %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1>
 
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast)
 
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -377,14 +377,14 @@ bb:
 ; GCN-NOT: or
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}}
-define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, ptr addrspace(3) %lds.gep2, align 4
 
   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
   %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1
@@ -392,7 +392,7 @@ bb:
   %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1
 
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -407,19 +407,19 @@ bb:
 ; GCN-NOT: or
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
-define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @fma_vector_vector_swap_vector(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, ptr addrspace(3) %lds.gep2, align 4
 
   %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap)
 
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -435,20 +435,20 @@ bb:
 ; GCN-NOT: xor
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, ptr addrspace(3) %lds.gep2, align 4
   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
 
   %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap)
 
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -464,19 +464,19 @@ bb:
 ; GCN-NOT: xor
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, ptr addrspace(3) %lds.gep2, align 4
   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
   %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0>
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
 
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -492,19 +492,19 @@ bb:
 ; GCN-NOT: xor
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_lo:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, ptr addrspace(3) %lds.gep2, align 4
   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
   %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1>
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
 
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -520,19 +520,19 @@ bb:
 ; GCN-NOT: xor
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, ptr addrspace(3) %lds.gep2, align 4
   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
   %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3>
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
 
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -548,58 +548,58 @@ bb:
 ; GCN-NOT: xor
 
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}}
-define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, ptr addrspace(3) %lds.gep2, align 4
   %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
   %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 1>
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
 
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}bitcast_fneg_f32:
 ; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
-define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @bitcast_fneg_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %f32 = load volatile float, float addrspace(3)* undef, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %f32 = load volatile float, ptr addrspace(3) undef, align 4
   %neg.f32 = fsub float -0.0, %f32
   %bc = bitcast float %neg.f32 to <2 x half>
   %result = fadd <2 x half> %vec0, %bc
 
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}shuffle_bitcast_fneg_f32:
 ; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}}
-define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @shuffle_bitcast_fneg_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
 
-  %f32 = load volatile float, float addrspace(3)* undef, align 4
+  %f32 = load volatile float, ptr addrspace(3) undef, align 4
   %neg.f32 = fsub float -0.0, %f32
   %bc = bitcast float %neg.f32 to <2 x half>
   %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> <i32 1, i32 0>
   %result = fadd <2 x half> %vec0, %shuf
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}extract_from_i64:
 ; GCN: v_lshl_or_b32
 ; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
-define amdgpu_kernel void @extract_from_i64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @extract_from_i64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
-  %i64 = load volatile i64, i64 addrspace(1)* undef
+  %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
+  %i64 = load volatile i64, ptr addrspace(1) undef
 
   %elt0 = trunc i64 %i64 to i16
   %hi = lshr i64 %i64, 16
@@ -608,7 +608,7 @@ bb:
   %ins0 = insertelement <2 x i16> undef, i16 %elt1, i32 0
   %ins1 = insertelement <2 x i16> %ins0, i16 %elt0, i32 1
   %result = add <2 x i16> %vec0, %ins1
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -626,16 +626,16 @@ bb:
 
 ; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
-define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @bitcast_lo_elt_op_sel(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, ptr addrspace(3) %lds.gep2, align 4
 
-  %scalar0 = load volatile i16, i16 addrspace(1)* undef
+  %scalar0 = load volatile i16, ptr addrspace(1) undef
   %shl = shl i16 %scalar0, 1
   %shl.bc = bitcast i16 %shl to half
 
@@ -643,7 +643,7 @@ bb:
   %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> <i32 1, i32 0>
 
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -661,17 +661,17 @@ bb:
 
 ; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
 ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
-define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 bb:
-  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
-  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+  %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
 
-  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
-  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
-  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
+  %vec1 = load volatile <2 x half>, ptr addrspace(3) %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, ptr addrspace(3) %lds.gep2, align 4
 
-  %scalar0 = load volatile i16, i16 addrspace(1)* undef
-  %scalar1 = load volatile half, half addrspace(1)* undef
+  %scalar0 = load volatile i16, ptr addrspace(1) undef
+  %scalar1 = load volatile half, ptr addrspace(1) undef
   %shl = shl i16 %scalar0, 1
   %shl.bc = bitcast i16 %shl to half
 
@@ -681,7 +681,7 @@ bb:
   %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> <i32 1, i32 0>
 
   %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1)
-  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %result, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll
index 1764d64c367f8..23b72ce458f50 100644
--- a/llvm/test/CodeGen/AMDGPU/packetizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll
@@ -7,7 +7,7 @@
 ; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z
 ; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W
 
-define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
 entry:
   %shl = sub i32 32, %e
   %x = add i32 %x_arg, 1
@@ -29,6 +29,6 @@ entry:
   %xy = or i32 %x.2, %y.2
   %zw = or i32 %z.2, %w.2
   %xyzw = or i32 %xy, %zw
-  store i32 %xyzw, i32 addrspace(1)* %out
+  store i32 %xyzw, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
index 3b4a0b822b2e8..c4af5f258243e 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
@@ -14,14 +14,14 @@ target datalayout = "A5"
 define amdgpu_cs void @test_simple_indirect_call() {
 ; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call() {
 ; AKF_GCN-NEXT:    [[PC:%.*]] = call i64 @llvm.amdgcn.s.getpc()
-; AKF_GCN-NEXT:    [[FUN:%.*]] = inttoptr i64 [[PC]] to void ()*
+; AKF_GCN-NEXT:    [[FUN:%.*]] = inttoptr i64 [[PC]] to ptr
 ; AKF_GCN-NEXT:    call amdgpu_gfx void [[FUN]]()
 ; AKF_GCN-NEXT:    ret void
 ;
 ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
 ; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] {
 ; ATTRIBUTOR_GCN-NEXT:    [[PC:%.*]] = call i64 @llvm.amdgcn.s.getpc()
-; ATTRIBUTOR_GCN-NEXT:    [[FUN:%.*]] = inttoptr i64 [[PC]] to void ()*
+; ATTRIBUTOR_GCN-NEXT:    [[FUN:%.*]] = inttoptr i64 [[PC]] to ptr
 ; ATTRIBUTOR_GCN-NEXT:    call amdgpu_gfx void [[FUN]]()
 ; ATTRIBUTOR_GCN-NEXT:    ret void
 ;
@@ -58,7 +58,7 @@ define amdgpu_cs void @test_simple_indirect_call() {
 
 
   %pc = call i64 @llvm.amdgcn.s.getpc()
-  %fun = inttoptr i64 %pc to void()*
+  %fun = inttoptr i64 %pc to ptr
   call amdgpu_gfx void %fun()
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
index 2ba029c8bb9c0..bf29873ba2800 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
@@ -8,7 +8,7 @@
 
 ; Ideally we only need 2 VGPRs for all spilling. The VGPRs are
 ; allocated per-frame index, so it's possible to get up with more.
-define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, i32 %in) #0 {
 ; GCN-LABEL: spill_sgprs_to_multiple_vgprs:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s92, SCRATCH_RSRC_DWORD0
@@ -467,7 +467,7 @@ ret:
 
 ; Some of the lanes of an SGPR spill are in one VGPR and some forced
 ; into the next available VGPR.
-define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 {
+define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %in) #1 {
 ; GCN-LABEL: split_sgpr_spill_2_vgprs:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
@@ -709,7 +709,7 @@ ret:
 ; The first 64 SGPR spills can go to a VGPR, but there isn't a second
 ; so some spills must be to memory. The last 16 element spill runs out
 ; of lanes at the 15th element.
-define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 {
+define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 {
 ; GCN-LABEL: no_vgprs_last_sgpr_spill:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0

diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
index e50c7c592f887..8e54bd51aaa5c 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
@@ -107,7 +107,7 @@ define amdgpu_kernel void @s_trunc_srl_i64_16_to_i16(i64 %x) {
   %shift = lshr i64 %x, 16
   %trunc = trunc i64 %shift to i16
   %add = or i16 %trunc, 4
-  store i16 %add, i16 addrspace(1)* undef
+  store i16 %add, ptr addrspace(1) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll b/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
index 4bcfe5f3d28cc..464b25f6eed3e 100644
--- a/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
+++ b/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
@@ -10,17 +10,17 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
-define amdgpu_kernel void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @dead_def_subregister(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %val = load i64, i64 addrspace(1)* %in.gep
+  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %val = load i64, ptr addrspace(1) %in.gep
 
   %lshr = shl i64 %val, 24
   %and1 = and i64 %lshr, 2190433320969 ; (255 << 33) | 9
   %vec = bitcast i64 %and1 to <2 x i32>
   %elt1 = extractelement <2 x i32> %vec, i32 1
 
-  store i32 %elt1, i32 addrspace(1)* %out
+  store i32 %elt1, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll b/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll
index 713006a262723..81918f5ca0eff 100644
--- a/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll
+++ b/llvm/test/CodeGen/AMDGPU/pk_max_f16_literal.ll
@@ -4,92 +4,92 @@
 
 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_0_1:
 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, 1.0 op_sel:[0,1] op_sel_hi:[1,0]{{$}}
-define amdgpu_kernel void @test_pk_max_f16_literal_0_1(<2 x half> addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_pk_max_f16_literal_0_1(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
-  %tmp2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i64 %tmp1
-  %tmp3 = load <2 x half>, <2 x half> addrspace(1)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
   %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH0000, half 0xH3C00>)
-  store <2 x half> %tmp4, <2 x half> addrspace(1)* %tmp2, align 4
+  store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_1_0:
 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, 1.0{{$}}
-define amdgpu_kernel void @test_pk_max_f16_literal_1_0(<2 x half> addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_pk_max_f16_literal_1_0(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
-  %tmp2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i64 %tmp1
-  %tmp3 = load <2 x half>, <2 x half> addrspace(1)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
   %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH3C00, half 0xH0000>) 
-  store <2 x half> %tmp4, <2 x half> addrspace(1)* %tmp2, align 4
+  store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_1_1:
 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, 1.0 op_sel_hi:[1,0]{{$}}
-define amdgpu_kernel void @test_pk_max_f16_literal_1_1(<2 x half> addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_pk_max_f16_literal_1_1(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
-  %tmp2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i64 %tmp1
-  %tmp3 = load <2 x half>, <2 x half> addrspace(1)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
   %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH3C00, half 0xH3C00>)
-  store <2 x half> %tmp4, <2 x half> addrspace(1)* %tmp2, align 4
+  store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_0_m1:
 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, -1.0 op_sel:[0,1] op_sel_hi:[1,0]{{$}}
-define amdgpu_kernel void @test_pk_max_f16_literal_0_m1(<2 x half> addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_pk_max_f16_literal_0_m1(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
-  %tmp2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i64 %tmp1
-  %tmp3 = load <2 x half>, <2 x half> addrspace(1)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
   %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH0000, half 0xHBC00>)
-  store <2 x half> %tmp4, <2 x half> addrspace(1)* %tmp2, align 4
+  store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_m1_0:
 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, -1.0{{$}}
-define amdgpu_kernel void @test_pk_max_f16_literal_m1_0(<2 x half> addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_pk_max_f16_literal_m1_0(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
-  %tmp2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i64 %tmp1
-  %tmp3 = load <2 x half>, <2 x half> addrspace(1)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
   %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xHBC00, half 0xH0000>)
-  store <2 x half> %tmp4, <2 x half> addrspace(1)* %tmp2, align 4
+  store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_m1_m1:
 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, -1.0 op_sel_hi:[1,0]{{$}}
-define amdgpu_kernel void @test_pk_max_f16_literal_m1_m1(<2 x half> addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_pk_max_f16_literal_m1_m1(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
-  %tmp2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i64 %tmp1
-  %tmp3 = load <2 x half>, <2 x half> addrspace(1)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
   %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xHBC00, half 0xHBC00>)
-  store <2 x half> %tmp4, <2 x half> addrspace(1)* %tmp2, align 4
+  store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_0_0:
 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
-define amdgpu_kernel void @test_pk_max_f16_literal_0_0(<2 x half> addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_pk_max_f16_literal_0_0(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
-  %tmp2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i64 %tmp1
-  %tmp3 = load <2 x half>, <2 x half> addrspace(1)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
   %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH0000, half 0xH0000>)
-  store <2 x half> %tmp4, <2 x half> addrspace(1)* %tmp2, align 4
+  store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
   ret void
 }
 
@@ -97,14 +97,14 @@ bb:
 ; GFX9:  s_mov_b32 [[C:s[0-9]+]], 0x41c80000
 ; GFX9:  v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, [[C]]{{$}}
 ; GFX10: v_pk_max_f16 v{{[0-9]+}}, 0x41c8, v{{[0-9]+}} op_sel:[1,0] op_sel_hi:[0,1]{{$}}
-define amdgpu_kernel void @test_pk_max_f16_literal_0_41c8(<2 x half> addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_pk_max_f16_literal_0_41c8(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
-  %tmp2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i64 %tmp1
-  %tmp3 = load <2 x half>, <2 x half> addrspace(1)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
   %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH0000, half 0xH41C8>)
-  store <2 x half> %tmp4, <2 x half> addrspace(1)* %tmp2, align 4
+  store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
   ret void
 }
 
@@ -112,14 +112,14 @@ bb:
 ; GFX9:  s_movk_i32 [[C:s[0-9]+]], 0x41c8
 ; GFX9:  v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, [[C]]{{$}}
 ; GFX10: v_pk_max_f16 v{{[0-9]+}}, 0x41c8, v{{[0-9]+}}{{$}}
-define amdgpu_kernel void @test_pk_max_f16_literal_41c8_0(<2 x half> addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_pk_max_f16_literal_41c8_0(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
-  %tmp2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i64 %tmp1
-  %tmp3 = load <2 x half>, <2 x half> addrspace(1)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
   %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH41C8, half 0xH0>)
-  store <2 x half> %tmp4, <2 x half> addrspace(1)* %tmp2, align 4
+  store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
   ret void
 }
 
@@ -127,14 +127,14 @@ bb:
 ; GFX9:  s_mov_b32 [[C:s[0-9]+]], 0x41c842ca
 ; GFX9:  v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, [[C]]{{$}}
 ; GFX10: v_pk_max_f16 v{{[0-9]+}}, 0x41c842ca, v{{[0-9]+}}{{$}}
-define amdgpu_kernel void @test_pk_max_f16_literal_42ca_41c8(<2 x half> addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test_pk_max_f16_literal_42ca_41c8(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
-  %tmp2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i64 %tmp1
-  %tmp3 = load <2 x half>, <2 x half> addrspace(1)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
   %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH42CA, half 0xH41C8>)
-  store <2 x half> %tmp4, <2 x half> addrspace(1)* %tmp2, align 4
+  store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
index 8daa7eeeaff6a..fc99e23a55c16 100644
--- a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
@@ -5,7 +5,7 @@
 ; of debug info. The debug info should not interfere with the
 ; bundling, which could result in an observable codegen change.
 
-define amdgpu_kernel void @dbg_clause(float addrspace(1)* %out, float addrspace(1)* %aptr) !dbg !4 {
+define amdgpu_kernel void @dbg_clause(ptr addrspace(1) %out, ptr addrspace(1) %aptr) !dbg !4 {
 ; GCN-LABEL: dbg_clause:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -20,15 +20,15 @@ define amdgpu_kernel void @dbg_clause(float addrspace(1)* %out, float addrspace(
 ; GCN-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %gep1 = getelementptr float, float addrspace(1)* %gep0, i32 8
-  %a = load float, float addrspace(1)* %gep0, align 4
+  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 8
+  %a = load float, ptr addrspace(1) %gep0, align 4
   call void @llvm.dbg.value(metadata float %a, metadata !8, metadata !DIExpression()), !dbg !9
-  %b = load float, float addrspace(1)* %gep1, align 4
+  %b = load float, ptr addrspace(1) %gep1, align 4
   call void @llvm.dbg.value(metadata float %b, metadata !10, metadata !DIExpression()), !dbg !11
   %fadd = fadd float %a, %b
-  store float %fadd, float addrspace(1)* %out.gep, align 4
+  store float %fadd, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
index 59a5528f4f255..5a7d811acb02a 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
@@ -3,7 +3,7 @@
 ; Make sure that AMDGPUPromoteAlloca doesn't crash if the called
 ; function is a constantexpr cast of a function.
 
-declare void @foo(float addrspace(5)*) #0
+declare void @foo(ptr addrspace(5)) #0
 declare void @foo.varargs(...) #0
 
 ; CHECK-LABEL: @crash_call_constexpr_cast(
@@ -11,7 +11,7 @@ declare void @foo.varargs(...) #0
 ; CHECK: call void
 define amdgpu_kernel void @crash_call_constexpr_cast() #0 {
   %alloca = alloca i32, addrspace(5)
-  call void bitcast (void (float addrspace(5)*)* @foo to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0
+  call void @foo(ptr addrspace(5) %alloca) #0
   ret void
 }
 
@@ -20,7 +20,7 @@ define amdgpu_kernel void @crash_call_constexpr_cast() #0 {
 ; CHECK: call void
 define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 {
   %alloca = alloca i32, addrspace(5)
-  call void bitcast (void (...)* @foo.varargs to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0
+  call void @foo.varargs(ptr addrspace(5) %alloca) #0
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
index fe31cface587a..3596c96b8cd79 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -5,32 +5,29 @@
 @global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
 @global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
 
-; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
 ; IR: alloca [10 x i32]
 ; ASM-LABEL: {{^}}promote_alloca_size_256:
 ; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 16
 ; ASM-NOT: .amdgpu_lds
 
-define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
 entry:
   %stack = alloca [10 x i32], align 4, addrspace(5)
-  %tmp = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [10 x i32], [10 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [10 x i32], [10 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %tmp2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [10 x i32], [10 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %tmp3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %tmp3, i32 addrspace(1)* %arrayidx13
-  %v0 = getelementptr inbounds [750 x [10 x i32]], [750 x [10 x i32]] addrspace(3)* @global_array0, i32 0, i32 0, i32 0
-  store i32 %tmp3, i32 addrspace(3)* %v0
-  %v1 = getelementptr inbounds [750 x [10 x i32]], [750 x [10 x i32]] addrspace(3)* @global_array1, i32 0, i32 0, i32 0
-  store i32 %tmp3, i32 addrspace(3)* %v1
+  %tmp = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [10 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [10 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %tmp2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %tmp2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [10 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %tmp3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %tmp3, ptr addrspace(1) %arrayidx13
+  store i32 %tmp3, ptr addrspace(3) @global_array0
+  store i32 %tmp3, ptr addrspace(3) @global_array1
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
index 2e8a15d99ccd6..efc11bf1a606d 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -5,14 +5,13 @@
 ; This normally would be fixed by instcombine to be compare to the GEP
 ; indices
 
-define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
-; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to i32 addrspace(4)*
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP2]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[TMP3]], align 4, !invariant.load !0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP2]], i64 2
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[TMP5]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
+; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !invariant.load !0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
 ; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP4]], 16
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
@@ -22,40 +21,39 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(i32 add
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw nsw i32 [[TMP9]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 [[TMP15]]
-; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[TMP16]], i32 0, i32 [[A:%.*]]
-; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[TMP16]], i32 0, i32 [[B:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 addrspace(3)* [[PTR0]], [[PTR1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 [[TMP15]]
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP16]], i32 0, i32 [[A:%.*]]
+; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP16]], i32 0, i32 [[B:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(3) [[PTR0]], [[PTR1]]
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; NOLDS-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
 ; NOLDS-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
-; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[A:%.*]]
-; NOLDS-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[B:%.*]]
-; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq i32 addrspace(5)* [[PTR0]], [[PTR1]]
+; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
+; NOLDS-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[B:%.*]]
+; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], [[PTR1]]
 ; NOLDS-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; NOLDS-NEXT:    store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; NOLDS-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; NOLDS-NEXT:    ret void
 ;
   %alloca = alloca [16 x i32], align 4, addrspace(5)
-  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
-  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b
-  %cmp = icmp eq i32 addrspace(5)* %ptr0, %ptr1
+  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %ptr1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
+  %cmp = icmp eq ptr addrspace(5) %ptr0, %ptr1
   %zext = zext i1 %cmp to i32
-  store volatile i32 %zext, i32 addrspace(1)* %out
+  store volatile i32 %zext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; CHECK-LABEL: @lds_promoted_alloca_icmp_null_rhs(
-; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to i32 addrspace(4)*
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP2]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[TMP3]], align 4, !invariant.load !0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP2]], i64 2
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[TMP5]], align 4, !range [[RNG1]], !invariant.load !0
+; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !invariant.load !0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4, !range [[RNG1]], !invariant.load !0
 ; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP4]], 16
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
@@ -65,37 +63,36 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw nsw i32 [[TMP9]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 [[TMP15]]
-; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[TMP16]], i32 0, i32 [[A:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 addrspace(3)* [[PTR0]], null
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 [[TMP15]]
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP16]], i32 0, i32 [[A:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(3) [[PTR0]], null
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; NOLDS-LABEL: @lds_promoted_alloca_icmp_null_rhs(
 ; NOLDS-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
-; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[A:%.*]]
-; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq i32 addrspace(5)* [[PTR0]], null
+; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
+; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], null
 ; NOLDS-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; NOLDS-NEXT:    store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; NOLDS-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; NOLDS-NEXT:    ret void
 ;
   %alloca = alloca [16 x i32], align 4, addrspace(5)
-  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
-  %cmp = icmp eq i32 addrspace(5)* %ptr0, null
+  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %cmp = icmp eq ptr addrspace(5) %ptr0, null
   %zext = zext i1 %cmp to i32
-  store volatile i32 %zext, i32 addrspace(1)* %out
+  store volatile i32 %zext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; CHECK-LABEL: @lds_promoted_alloca_icmp_null_lhs(
-; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to i32 addrspace(4)*
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP2]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[TMP3]], align 4, !invariant.load !0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP2]], i64 2
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[TMP5]], align 4, !range [[RNG1]], !invariant.load !0
+; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !invariant.load !0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4, !range [[RNG1]], !invariant.load !0
 ; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP4]], 16
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
@@ -105,57 +102,57 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw nsw i32 [[TMP9]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 [[TMP15]]
-; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[TMP16]], i32 0, i32 [[A:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 addrspace(3)* null, [[PTR0]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 [[TMP15]]
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP16]], i32 0, i32 [[A:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(3) null, [[PTR0]]
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; NOLDS-LABEL: @lds_promoted_alloca_icmp_null_lhs(
 ; NOLDS-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
-; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[A:%.*]]
-; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq i32 addrspace(5)* null, [[PTR0]]
+; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
+; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) null, [[PTR0]]
 ; NOLDS-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; NOLDS-NEXT:    store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; NOLDS-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; NOLDS-NEXT:    ret void
 ;
   %alloca = alloca [16 x i32], align 4, addrspace(5)
-  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
-  %cmp = icmp eq i32 addrspace(5)* null, %ptr0
+  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %cmp = icmp eq ptr addrspace(5) null, %ptr0
   %zext = zext i1 %cmp to i32
-  store volatile i32 %zext, i32 addrspace(1)* %out
+  store volatile i32 %zext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; CHECK-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
-; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[A:%.*]]
-; CHECK-NEXT:    [[PTR1:%.*]] = call i32 addrspace(5)* @get_unknown_pointer()
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 addrspace(5)* [[PTR0]], [[PTR1]]
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
+; CHECK-NEXT:    [[PTR1:%.*]] = call ptr addrspace(5) @get_unknown_pointer()
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], [[PTR1]]
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; NOLDS-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
 ; NOLDS-NEXT:    [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
-; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[A:%.*]]
-; NOLDS-NEXT:    [[PTR1:%.*]] = call i32 addrspace(5)* @get_unknown_pointer()
-; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq i32 addrspace(5)* [[PTR0]], [[PTR1]]
+; NOLDS-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
+; NOLDS-NEXT:    [[PTR1:%.*]] = call ptr addrspace(5) @get_unknown_pointer()
+; NOLDS-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], [[PTR1]]
 ; NOLDS-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
-; NOLDS-NEXT:    store volatile i32 [[ZEXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; NOLDS-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; NOLDS-NEXT:    ret void
 ;
   %alloca = alloca [16 x i32], align 4, addrspace(5)
-  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
-  %ptr1 = call i32 addrspace(5)* @get_unknown_pointer()
-  %cmp = icmp eq i32 addrspace(5)* %ptr0, %ptr1
+  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %ptr1 = call ptr addrspace(5) @get_unknown_pointer()
+  %cmp = icmp eq ptr addrspace(5) %ptr0, %ptr1
   %zext = zext i1 %cmp to i32
-  store volatile i32 %zext, i32 addrspace(1)* %out
+  store volatile i32 %zext, ptr addrspace(1) %out
   ret void
 }
 
-declare i32 addrspace(5)* @get_unknown_pointer() #0
+declare ptr addrspace(5) @get_unknown_pointer() #0
 
 attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
index a3e685a37832a..b32faecf8868c 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
@@ -4,26 +4,26 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
 
 ; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand(
 ; CHECK: %alloca = alloca i32
-; CHECK: select i1 undef, i32 addrspace(5)* undef, i32 addrspace(5)* %alloca
+; CHECK: select i1 undef, ptr addrspace(5) undef, ptr addrspace(5) %alloca
 define amdgpu_kernel void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
   %alloca = alloca i32, align 4, addrspace(5)
-  %select = select i1 undef, i32 addrspace(5)* undef, i32 addrspace(5)* %alloca
-  store i32 0, i32 addrspace(5)* %select, align 4
+  %select = select i1 undef, ptr addrspace(5) undef, ptr addrspace(5) %alloca
+  store i32 0, ptr addrspace(5) %select, align 4
   ret void
 }
 
 ; CHECK-LABEL: @lds_promote_alloca_select_two_derived_pointers(
-; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promote_alloca_select_two_derived_pointers.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
-; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
-; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
-; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promote_alloca_select_two_derived_pointers.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(3) [[ARRAYGEP]], i32 0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], ptr addrspace(3) [[ARRAYGEP]], i32 0, i32 %b
+; CHECK: %select = select i1 undef, ptr addrspace(3) %ptr0, ptr addrspace(3) %ptr1
+; CHECK: store i32 0, ptr addrspace(3) %select, align 4
 define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32], align 4, addrspace(5)
-  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
-  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b
-  %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
-  store i32 0, i32 addrspace(5)* %select, align 4
+  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %ptr1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
+  %select = select i1 undef, ptr addrspace(5) %ptr0, ptr addrspace(5) %ptr1
+  store i32 0, ptr addrspace(5) %select, align 4
   ret void
 }
 
@@ -32,32 +32,32 @@ define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a
 ; CHECK-LABEL: @lds_promote_alloca_select_two_allocas(
 ; CHECK: %alloca0 = alloca i32, i32 16, align 4
 ; CHECK: %alloca1 = alloca i32, i32 16, align 4
-; CHECK: %ptr0 = getelementptr inbounds i32, i32 addrspace(5)* %alloca0, i32 %a
-; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %alloca1, i32 %b
-; CHECK: %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
+; CHECK: %ptr0 = getelementptr inbounds i32, ptr addrspace(5) %alloca0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds i32, ptr addrspace(5) %alloca1, i32 %b
+; CHECK: %select = select i1 undef, ptr addrspace(5) %ptr0, ptr addrspace(5) %ptr1
 define amdgpu_kernel void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
   %alloca0 = alloca i32, i32 16, align 4, addrspace(5)
   %alloca1 = alloca i32, i32 16, align 4, addrspace(5)
-  %ptr0 = getelementptr inbounds i32, i32 addrspace(5)* %alloca0, i32 %a
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %alloca1, i32 %b
-  %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
-  store i32 0, i32 addrspace(5)* %select, align 4
+  %ptr0 = getelementptr inbounds i32, ptr addrspace(5) %alloca0, i32 %a
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(5) %alloca1, i32 %b
+  %select = select i1 undef, ptr addrspace(5) %ptr0, ptr addrspace(5) %ptr1
+  store i32 0, ptr addrspace(5) %select, align 4
   ret void
 }
 
 ; TODO: Maybe this should be canonicalized to select on the constant and GEP after.
 ; CHECK-LABEL: @lds_promote_alloca_select_two_derived_constant_pointers(
-; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promote_alloca_select_two_derived_constant_pointers.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 1
-; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 3
-; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
-; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promote_alloca_select_two_derived_constant_pointers.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(3) [[ARRAYGEP]], i32 0, i32 1
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], ptr addrspace(3) [[ARRAYGEP]], i32 0, i32 3
+; CHECK: %select = select i1 undef, ptr addrspace(3) %ptr0, ptr addrspace(3) %ptr1
+; CHECK: store i32 0, ptr addrspace(3) %select, align 4
 define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
   %alloca = alloca [16 x i32], align 4, addrspace(5)
-  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
-  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 3
-  %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
-  store i32 0, i32 addrspace(5)* %select, align 4
+  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
+  %ptr1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 3
+  %select = select i1 undef, ptr addrspace(5) %ptr0, ptr addrspace(5) %ptr1
+  store i32 0, ptr addrspace(5) %select, align 4
   ret void
 }
 
@@ -68,63 +68,63 @@ define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointe
 ; CHECK: alloca
 define amdgpu_kernel void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c, i1 %c1, i1 %c2) #0 {
   %alloca = alloca [16 x i32], align 4, addrspace(5)
-  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
-  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b
-  %ptr2 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %c
-  %select0 = select i1 %c1, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
-  %select1 = select i1 %c2, i32 addrspace(5)* %select0, i32 addrspace(5)* %ptr2
-  store i32 0, i32 addrspace(5)* %select1, align 4
+  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %ptr1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
+  %ptr2 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %c
+  %select0 = select i1 %c1, ptr addrspace(5) %ptr0, ptr addrspace(5) %ptr1
+  %select1 = select i1 %c2, ptr addrspace(5) %select0, ptr addrspace(5) %ptr2
+  store i32 0, ptr addrspace(5) %select1, align 4
   ret void
 }
 
 define amdgpu_kernel void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
-  %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
-  %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b
-  store i32 0, i32 addrspace(5)* %ptr0
+  %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
+  %ptr1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
+  store i32 0, ptr addrspace(5) %ptr0
   br i1 undef, label %bb1, label %bb2
 
 bb1:
-  %ptr2 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %c
-  %select0 = select i1 undef, i32 addrspace(5)* undef, i32 addrspace(5)* %ptr2
-  store i32 0, i32 addrspace(5)* %ptr1
+  %ptr2 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %c
+  %select0 = select i1 undef, ptr addrspace(5) undef, ptr addrspace(5) %ptr2
+  store i32 0, ptr addrspace(5) %ptr1
   br label %bb2
 
 bb2:
-  %phi.ptr = phi i32 addrspace(5)* [ %ptr0, %entry ], [ %select0, %bb1 ]
-  %select1 = select i1 undef, i32 addrspace(5)* %phi.ptr, i32 addrspace(5)* %ptr1
-  store i32 0, i32 addrspace(5)* %select1, align 4
+  %phi.ptr = phi ptr addrspace(5) [ %ptr0, %entry ], [ %select0, %bb1 ]
+  %select1 = select i1 undef, ptr addrspace(5) %phi.ptr, ptr addrspace(5) %ptr1
+  store i32 0, ptr addrspace(5) %select1, align 4
   ret void
 }
 
 ; CHECK-LABEL: @select_null_rhs(
 ; CHECK-NOT: alloca
-; CHECK: select i1 %tmp2, double addrspace(3)* %{{[0-9]+}}, double addrspace(3)* null
-define amdgpu_kernel void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
+; CHECK: select i1 %tmp2, ptr addrspace(3) %{{[0-9]+}}, ptr addrspace(3) null
+define amdgpu_kernel void @select_null_rhs(ptr addrspace(1) nocapture %arg, i32 %arg1) #1 {
 bb:
   %tmp = alloca double, align 8, addrspace(5)
-  store double 0.000000e+00, double addrspace(5)* %tmp, align 8
+  store double 0.000000e+00, ptr addrspace(5) %tmp, align 8
   %tmp2 = icmp eq i32 %arg1, 0
-  %tmp3 = select i1 %tmp2, double addrspace(5)* %tmp, double addrspace(5)* null
-  store double 1.000000e+00, double addrspace(5)* %tmp3, align 8
-  %tmp4 = load double, double addrspace(5)* %tmp, align 8
-  store double %tmp4, double addrspace(1)* %arg
+  %tmp3 = select i1 %tmp2, ptr addrspace(5) %tmp, ptr addrspace(5) null
+  store double 1.000000e+00, ptr addrspace(5) %tmp3, align 8
+  %tmp4 = load double, ptr addrspace(5) %tmp, align 8
+  store double %tmp4, ptr addrspace(1) %arg
   ret void
 }
 
 ; CHECK-LABEL: @select_null_lhs(
 ; CHECK-NOT: alloca
-; CHECK: select i1 %tmp2, double addrspace(3)* null, double addrspace(3)* %{{[0-9]+}}
-define amdgpu_kernel void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
+; CHECK: select i1 %tmp2, ptr addrspace(3) null, ptr addrspace(3) %{{[0-9]+}}
+define amdgpu_kernel void @select_null_lhs(ptr addrspace(1) nocapture %arg, i32 %arg1) #1 {
 bb:
   %tmp = alloca double, align 8, addrspace(5)
-  store double 0.000000e+00, double addrspace(5)* %tmp, align 8
+  store double 0.000000e+00, ptr addrspace(5) %tmp, align 8
   %tmp2 = icmp eq i32 %arg1, 0
-  %tmp3 = select i1 %tmp2, double addrspace(5)* null, double addrspace(5)* %tmp
-  store double 1.000000e+00, double addrspace(5)* %tmp3, align 8
-  %tmp4 = load double, double addrspace(5)* %tmp, align 8
-  store double %tmp4, double addrspace(1)* %arg
+  %tmp3 = select i1 %tmp2, ptr addrspace(5) null, ptr addrspace(5) %tmp
+  store double 1.000000e+00, ptr addrspace(5) %tmp3, align 8
+  %tmp4 = load double, ptr addrspace(5) %tmp, align 8
+  store double %tmp4, ptr addrspace(1) %arg
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
index 24a2d6a28d054..25a2924bef541 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
@@ -10,13 +10,13 @@
 define void @foo1(i32 %x) #1 {
 entry:
   %cc = icmp eq i32 %x, 0
-  store volatile i1 %cc, i1* undef
+  store volatile i1 %cc, ptr undef
   ret void
 }
 
 define amdgpu_kernel void @kernel1(float %x) #0 {
 entry:
-  call void bitcast (void (i32)* @foo1 to void (float)*)(float %x)
+  call void @foo1(float %x)
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll b/llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll
index 2f4de78fda1ba..3020991e8a930 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll
@@ -6,7 +6,7 @@
 ; The 64-bit pointer argument %arg1 will be split into two registers
 ; and for its llvm.dbg.declare, DAG should emit two DBG_VALUE instructions
 ; with the fragment expressions.
-define hidden void @ptr_arg_split_subregs(%struct.A* %arg1) #0 !dbg !9 {
+define hidden void @ptr_arg_split_subregs(ptr %arg1) #0 !dbg !9 {
 ; CHECK-LABEL: ptr_arg_split_subregs:
 ; CHECK:       .Lfunc_begin0:
 ; CHECK:       .loc 1 5 0 ; example.cpp:5:0
@@ -25,9 +25,9 @@ define hidden void @ptr_arg_split_subregs(%struct.A* %arg1) #0 !dbg !9 {
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .Ltmp1:
 ; CHECK:         .cfi_endproc
-  call void @llvm.dbg.declare(metadata %struct.A* %arg1, metadata !20, metadata !DIExpression()), !dbg !21
-  %gep1 = getelementptr inbounds %struct.A, %struct.A* %arg1, i32 0, i32 0, i32 99, !dbg !22
-  store i32 1, i32* %gep1, align 4, !dbg !23
+  call void @llvm.dbg.declare(metadata ptr %arg1, metadata !20, metadata !DIExpression()), !dbg !21
+  %gep1 = getelementptr inbounds %struct.A, ptr %arg1, i32 0, i32 0, i32 99, !dbg !22
+  store i32 1, ptr %gep1, align 4, !dbg !23
   ret void, !dbg !24
 }
 
@@ -37,7 +37,7 @@ define hidden void @ptr_arg_split_subregs(%struct.A* %arg1) #0 !dbg !9 {
 ; are totally misleading. The former represent part of the incoming argument in register
 ; while the latter was emitted for the parameter copy to a virtual register inserted
 ; at the function entry by DAGBuilder.
-define hidden void @ptr_arg_split_reg_mem(<30 x i32>, %struct.A* %arg2) #0 !dbg !25 {
+define hidden void @ptr_arg_split_reg_mem(<30 x i32>, ptr %arg2) #0 !dbg !25 {
 ; CHECK-LABEL: ptr_arg_split_reg_mem:
 ; CHECK:       .Lfunc_begin1:
 ; CHECK-NEXT:    .loc 1 10 0 ; example.cpp:10:0
@@ -56,15 +56,15 @@ define hidden void @ptr_arg_split_reg_mem(<30 x i32>, %struct.A* %arg2) #0 !dbg
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .Ltmp3:
 ; CHECK:         .cfi_endproc
-  call void @llvm.dbg.declare(metadata %struct.A* %arg2, metadata !26, metadata !DIExpression()), !dbg !27
-  %gep2 = getelementptr inbounds %struct.A, %struct.A* %arg2, i32 0, i32 0, i32 99, !dbg !28
-  store i32 1, i32* %gep2, align 4, !dbg !29
+  call void @llvm.dbg.declare(metadata ptr %arg2, metadata !26, metadata !DIExpression()), !dbg !27
+  %gep2 = getelementptr inbounds %struct.A, ptr %arg2, i32 0, i32 0, i32 99, !dbg !28
+  store i32 1, ptr %gep2, align 4, !dbg !29
   ret void, !dbg !30
 }
 
 ; FIXME: The 64-bit pointer argument %arg3 will be entirely in the stack memory.
 ; No DBG_VALUE emitted for the incoming argument in this case and it should be fixed.
-define hidden void @ptr_arg_in_memory(<32 x i32>, %struct.A* %arg3) #0 !dbg !31 {
+define hidden void @ptr_arg_in_memory(<32 x i32>, ptr %arg3) #0 !dbg !31 {
 ; CHECK-LABEL: ptr_arg_in_memory:
 ; CHECK:       .Lfunc_begin2:
 ; CHECK-NEXT:    .loc 1 15 0 ; example.cpp:15:0
@@ -83,9 +83,9 @@ define hidden void @ptr_arg_in_memory(<32 x i32>, %struct.A* %arg3) #0 !dbg !31
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .Ltmp5:
 ; CHECK:         .cfi_endproc
-  call void @llvm.dbg.declare(metadata %struct.A* %arg3, metadata !32, metadata !DIExpression()), !dbg !33
-  %gep3 = getelementptr inbounds %struct.A, %struct.A* %arg3, i32 0, i32 0, i32 99, !dbg !34
-  store i32 1, i32* %gep3, align 4, !dbg !35
+  call void @llvm.dbg.declare(metadata ptr %arg3, metadata !32, metadata !DIExpression()), !dbg !33
+  %gep3 = getelementptr inbounds %struct.A, ptr %arg3, i32 0, i32 0, i32 99, !dbg !34
+  store i32 1, ptr %gep3, align 4, !dbg !35
   ret void, !dbg !36
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ptrmask.ll b/llvm/test/CodeGen/AMDGPU/ptrmask.ll
index 6ec9eb78b83a5..0eea4fb4f893f 100644
--- a/llvm/test/CodeGen/AMDGPU/ptrmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptrmask.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
 
-define i8 addrspace(1)* @v_ptrmask_global_variable_i64(i8 addrspace(1)* %ptr, i64 %mask) {
+define ptr addrspace(1) @v_ptrmask_global_variable_i64(ptr addrspace(1) %ptr, i64 %mask) {
 ; GCN-LABEL: v_ptrmask_global_variable_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -18,11 +18,11 @@ define i8 addrspace(1)* @v_ptrmask_global_variable_i64(i8 addrspace(1)* %ptr, i6
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
-  %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)* %ptr, i64 %mask)
-  ret i8 addrspace(1)* %masked
+  %masked = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %ptr, i64 %mask)
+  ret ptr addrspace(1) %masked
 }
 
-define i8 addrspace(1)* @v_ptrmask_global_variable_i32(i8 addrspace(1)* %ptr, i32 %mask) {
+define ptr addrspace(1) @v_ptrmask_global_variable_i32(ptr addrspace(1) %ptr, i32 %mask) {
 ; GCN-LABEL: v_ptrmask_global_variable_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -44,11 +44,11 @@ define i8 addrspace(1)* @v_ptrmask_global_variable_i32(i8 addrspace(1)* %ptr, i3
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i32(i8 addrspace(1)* %ptr, i32 %mask)
-  ret i8 addrspace(1)* %masked
+  %masked = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) %ptr, i32 %mask)
+  ret ptr addrspace(1) %masked
 }
 
-define i8 addrspace(1)* @v_ptrmask_global_variable_i16(i8 addrspace(1)* %ptr, i16 %mask) {
+define ptr addrspace(1) @v_ptrmask_global_variable_i16(ptr addrspace(1) %ptr, i16 %mask) {
 ; GCN-LABEL: v_ptrmask_global_variable_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -71,11 +71,11 @@ define i8 addrspace(1)* @v_ptrmask_global_variable_i16(i8 addrspace(1)* %ptr, i1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i16(i8 addrspace(1)* %ptr, i16 %mask)
-  ret i8 addrspace(1)* %masked
+  %masked = call ptr addrspace(1) @llvm.ptrmask.p1.i16(ptr addrspace(1) %ptr, i16 %mask)
+  ret ptr addrspace(1) %masked
 }
 
-define i8 addrspace(3)* @v_ptrmask_local_variable_i64(i8 addrspace(3)* %ptr, i64 %mask) {
+define ptr addrspace(3) @v_ptrmask_local_variable_i64(ptr addrspace(3) %ptr, i64 %mask) {
 ; GCN-LABEL: v_ptrmask_local_variable_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -88,11 +88,11 @@ define i8 addrspace(3)* @v_ptrmask_local_variable_i64(i8 addrspace(3)* %ptr, i64
 ; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
-  %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i64(i8 addrspace(3)* %ptr, i64 %mask)
-  ret i8 addrspace(3)* %masked
+  %masked = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) %ptr, i64 %mask)
+  ret ptr addrspace(3) %masked
 }
 
-define i8 addrspace(3)* @v_ptrmask_local_variable_i32(i8 addrspace(3)* %ptr, i32 %mask) {
+define ptr addrspace(3) @v_ptrmask_local_variable_i32(ptr addrspace(3) %ptr, i32 %mask) {
 ; GCN-LABEL: v_ptrmask_local_variable_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -105,11 +105,11 @@ define i8 addrspace(3)* @v_ptrmask_local_variable_i32(i8 addrspace(3)* %ptr, i32
 ; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
-  %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* %ptr, i32 %mask)
-  ret i8 addrspace(3)* %masked
+  %masked = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) %ptr, i32 %mask)
+  ret ptr addrspace(3) %masked
 }
 
-define i8 addrspace(3)* @v_ptrmask_local_variable_i16(i8 addrspace(3)* %ptr, i16 %mask) {
+define ptr addrspace(3) @v_ptrmask_local_variable_i16(ptr addrspace(3) %ptr, i16 %mask) {
 ; GCN-LABEL: v_ptrmask_local_variable_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -130,11 +130,11 @@ define i8 addrspace(3)* @v_ptrmask_local_variable_i16(i8 addrspace(3)* %ptr, i16
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i16(i8 addrspace(3)* %ptr, i16 %mask)
-  ret i8 addrspace(3)* %masked
+  %masked = call ptr addrspace(3) @llvm.ptrmask.p3.i16(ptr addrspace(3) %ptr, i16 %mask)
+  ret ptr addrspace(3) %masked
 }
 
-define amdgpu_ps i8 addrspace(1)* @s_ptrmask_global_variable_i64(i8 addrspace(1)* inreg %ptr, i64 inreg %mask) {
+define amdgpu_ps ptr addrspace(1) @s_ptrmask_global_variable_i64(ptr addrspace(1) inreg %ptr, i64 inreg %mask) {
 ; GCN-LABEL: s_ptrmask_global_variable_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[4:5]
@@ -144,11 +144,11 @@ define amdgpu_ps i8 addrspace(1)* @s_ptrmask_global_variable_i64(i8 addrspace(1)
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[4:5]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
-  %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)* %ptr, i64 %mask)
-  ret i8 addrspace(1)* %masked
+  %masked = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %ptr, i64 %mask)
+  ret ptr addrspace(1) %masked
 }
 
-define amdgpu_ps i8 addrspace(1)* @s_ptrmask_global_variable_i32(i8 addrspace(1)* inreg %ptr, i32 inreg %mask) {
+define amdgpu_ps ptr addrspace(1) @s_ptrmask_global_variable_i32(ptr addrspace(1) inreg %ptr, i32 inreg %mask) {
 ; GCN-LABEL: s_ptrmask_global_variable_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s5, 0
@@ -162,11 +162,11 @@ define amdgpu_ps i8 addrspace(1)* @s_ptrmask_global_variable_i32(i8 addrspace(1)
 ; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[4:5]
 ; GFX10PLUS-NEXT:    s_mov_b32 s1, 0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
-  %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i32(i8 addrspace(1)* %ptr, i32 %mask)
-  ret i8 addrspace(1)* %masked
+  %masked = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) %ptr, i32 %mask)
+  ret ptr addrspace(1) %masked
 }
 
-define amdgpu_ps i8 addrspace(1)* @s_ptrmask_global_variable_i16(i8 addrspace(1)* inreg %ptr, i16 inreg %mask) {
+define amdgpu_ps ptr addrspace(1) @s_ptrmask_global_variable_i16(ptr addrspace(1) inreg %ptr, i16 inreg %mask) {
 ; GCN-LABEL: s_ptrmask_global_variable_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_and_b32 s0, s4, 0xffff
@@ -182,11 +182,11 @@ define amdgpu_ps i8 addrspace(1)* @s_ptrmask_global_variable_i16(i8 addrspace(1)
 ; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10PLUS-NEXT:    s_mov_b32 s1, 0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
-  %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i16(i8 addrspace(1)* %ptr, i16 %mask)
-  ret i8 addrspace(1)* %masked
+  %masked = call ptr addrspace(1) @llvm.ptrmask.p1.i16(ptr addrspace(1) %ptr, i16 %mask)
+  ret ptr addrspace(1) %masked
 }
 
-define amdgpu_ps i8 addrspace(3)* @s_ptrmask_local_variable_i64(i8 addrspace(3)* inreg %ptr, i64 inreg %mask) {
+define amdgpu_ps ptr addrspace(3) @s_ptrmask_local_variable_i64(ptr addrspace(3) inreg %ptr, i64 inreg %mask) {
 ; GCN-LABEL: s_ptrmask_local_variable_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_and_b32 s0, s2, s3
@@ -196,11 +196,11 @@ define amdgpu_ps i8 addrspace(3)* @s_ptrmask_local_variable_i64(i8 addrspace(3)*
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_and_b32 s0, s2, s3
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
-  %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i64(i8 addrspace(3)* %ptr, i64 %mask)
-  ret i8 addrspace(3)* %masked
+  %masked = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) %ptr, i64 %mask)
+  ret ptr addrspace(3) %masked
 }
 
-define amdgpu_ps i8 addrspace(3)* @s_ptrmask_local_variable_i32(i8 addrspace(3)* inreg %ptr, i32 inreg %mask) {
+define amdgpu_ps ptr addrspace(3) @s_ptrmask_local_variable_i32(ptr addrspace(3) inreg %ptr, i32 inreg %mask) {
 ; GCN-LABEL: s_ptrmask_local_variable_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_and_b32 s0, s2, s3
@@ -210,11 +210,11 @@ define amdgpu_ps i8 addrspace(3)* @s_ptrmask_local_variable_i32(i8 addrspace(3)*
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_and_b32 s0, s2, s3
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
-  %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* %ptr, i32 %mask)
-  ret i8 addrspace(3)* %masked
+  %masked = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) %ptr, i32 %mask)
+  ret ptr addrspace(3) %masked
 }
 
-define amdgpu_ps i8 addrspace(3)* @s_ptrmask_local_variable_i16(i8 addrspace(3)* inreg %ptr, i16 inreg %mask) {
+define amdgpu_ps ptr addrspace(3) @s_ptrmask_local_variable_i16(ptr addrspace(3) inreg %ptr, i16 inreg %mask) {
 ; GCN-LABEL: s_ptrmask_local_variable_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_and_b32 s0, 0xffff, s3
@@ -226,15 +226,15 @@ define amdgpu_ps i8 addrspace(3)* @s_ptrmask_local_variable_i16(i8 addrspace(3)*
 ; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s3
 ; GFX10PLUS-NEXT:    s_and_b32 s0, s2, s0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
-  %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i16(i8 addrspace(3)* %ptr, i16 %mask)
-  ret i8 addrspace(3)* %masked
+  %masked = call ptr addrspace(3) @llvm.ptrmask.p3.i16(ptr addrspace(3) %ptr, i16 %mask)
+  ret ptr addrspace(3) %masked
 }
 
-declare i8 addrspace(3)* @llvm.ptrmask.p3i8.i64(i8 addrspace(3)*, i64) #0
-declare i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)*, i32) #0
-declare i8 addrspace(3)* @llvm.ptrmask.p3i8.i16(i8 addrspace(3)*, i16) #0
-declare i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)*, i64) #0
-declare i8 addrspace(1)* @llvm.ptrmask.p1i8.i32(i8 addrspace(1)*, i32) #0
-declare i8 addrspace(1)* @llvm.ptrmask.p1i8.i16(i8 addrspace(1)*, i16) #0
+declare ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3), i64) #0
+declare ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3), i32) #0
+declare ptr addrspace(3) @llvm.ptrmask.p3.i16(ptr addrspace(3), i16) #0
+declare ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1), i64) #0
+declare ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1), i32) #0
+declare ptr addrspace(1) @llvm.ptrmask.p1.i16(ptr addrspace(1), i16) #0
 
 attributes #0 = { nounwind readnone speculatable willreturn }

diff  --git a/llvm/test/CodeGen/AMDGPU/pv-packing.ll b/llvm/test/CodeGen/AMDGPU/pv-packing.ll
index fbd6fc593d959..074966e6e3cbe 100644
--- a/llvm/test/CodeGen/AMDGPU/pv-packing.ll
+++ b/llvm/test/CodeGen/AMDGPU/pv-packing.ll
@@ -31,8 +31,8 @@ main_body:
   %6 = extractelement <4 x float> %reg3, i32 0
   %7 = extractelement <4 x float> %reg3, i32 1
   %8 = extractelement <4 x float> %reg3, i32 2
-  %9 = load <4 x float>, <4 x float> addrspace(8)* null
-  %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %9 = load <4 x float>, ptr addrspace(8) null
+  %10 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %11 = call float @llvm.r600.dot4(<4 x float> %9, <4 x float> %9)
   %12 = fmul float %0, %3
   %13 = fadd float %12, %6

diff  --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
index 6b169e0f18f6c..3659f27d06f83 100644
--- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
+++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
@@ -4,10 +4,10 @@
 
 declare i32 @llvm.read_register.i32(metadata) #0
 
-define amdgpu_kernel void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind {
-  store volatile i32 0, i32 addrspace(3)* undef
+define amdgpu_kernel void @test_invalid_read_flat_scratch_lo(ptr addrspace(1) %out) nounwind {
+  store volatile i32 0, ptr addrspace(3) undef
   %m0 = call i32 @llvm.read_register.i32(metadata !0)
-  store i32 %m0, i32 addrspace(1)* %out
+  store i32 %m0, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
index 178b22f329b11..a7ea3391079e1 100644
--- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
@@ -4,10 +4,10 @@
 
 declare i32 @llvm.read_register.i32(metadata) #0
 
-define amdgpu_kernel void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind {
-  store volatile i32 0, i32 addrspace(3)* undef
+define amdgpu_kernel void @test_invalid_read_exec(ptr addrspace(1) %out) nounwind {
+  store volatile i32 0, ptr addrspace(3) undef
   %m0 = call i32 @llvm.read_register.i32(metadata !0)
-  store i32 %m0, i32 addrspace(1)* %out
+  store i32 %m0, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
index f05d900b39d63..dbdd7edcfd6b3 100644
--- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
@@ -4,9 +4,9 @@
 
 declare i64 @llvm.read_register.i64(metadata) #0
 
-define amdgpu_kernel void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_invalid_read_m0(ptr addrspace(1) %out) #0 {
   %exec = call i64 @llvm.read_register.i64(metadata !0)
-  store i64 %exec, i64 addrspace(1)* %out
+  store i64 %exec, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/read_register.ll b/llvm/test/CodeGen/AMDGPU/read_register.ll
index 8e8fc44bf57df..227a08ecc4dbe 100644
--- a/llvm/test/CodeGen/AMDGPU/read_register.ll
+++ b/llvm/test/CodeGen/AMDGPU/read_register.ll
@@ -8,10 +8,10 @@ declare i64 @llvm.read_register.i64(metadata) #0
 ; CHECK: s_mov_b32 m0, -1
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], m0
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[COPY]]
-define amdgpu_kernel void @test_read_m0(i32 addrspace(1)* %out) #0 {
-  store volatile i32 0, i32 addrspace(3)* undef
+define amdgpu_kernel void @test_read_m0(ptr addrspace(1) %out) #0 {
+  store volatile i32 0, ptr addrspace(3) undef
   %m0 = call i32 @llvm.read_register.i32(metadata !0)
-  store i32 %m0, i32 addrspace(1)* %out
+  store i32 %m0, ptr addrspace(1) %out
   ret void
 }
 
@@ -19,9 +19,9 @@ define amdgpu_kernel void @test_read_m0(i32 addrspace(1)* %out) #0 {
 ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], exec_lo
 ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], exec_hi
 ; CHECK: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @test_read_exec(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_exec(ptr addrspace(1) %out) #0 {
   %exec = call i64 @llvm.read_register.i64(metadata !1)
-  store i64 %exec, i64 addrspace(1)* %out
+  store i64 %exec, ptr addrspace(1) %out
   ret void
 }
 
@@ -29,45 +29,45 @@ define amdgpu_kernel void @test_read_exec(i64 addrspace(1)* %out) #0 {
 ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], flat_scratch_lo
 ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], flat_scratch_hi
 ; CHECK: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_flat_scratch(ptr addrspace(1) %out) #0 {
   %flat_scratch = call i64 @llvm.read_register.i64(metadata !2)
-  store i64 %flat_scratch, i64 addrspace(1)* %out
+  store i64 %flat_scratch, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: {{^}}test_read_flat_scratch_lo:
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_lo
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[COPY]]
-define amdgpu_kernel void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_flat_scratch_lo(ptr addrspace(1) %out) #0 {
   %flat_scratch_lo = call i32 @llvm.read_register.i32(metadata !3)
-  store i32 %flat_scratch_lo, i32 addrspace(1)* %out
+  store i32 %flat_scratch_lo, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: {{^}}test_read_flat_scratch_hi:
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_hi
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[COPY]]
-define amdgpu_kernel void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_flat_scratch_hi(ptr addrspace(1) %out) #0 {
   %flat_scratch_hi = call i32 @llvm.read_register.i32(metadata !4)
-  store i32 %flat_scratch_hi, i32 addrspace(1)* %out
+  store i32 %flat_scratch_hi, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: {{^}}test_read_exec_lo:
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_lo
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[COPY]]
-define amdgpu_kernel void @test_read_exec_lo(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_exec_lo(ptr addrspace(1) %out) #0 {
   %exec_lo = call i32 @llvm.read_register.i32(metadata !5)
-  store i32 %exec_lo, i32 addrspace(1)* %out
+  store i32 %exec_lo, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: {{^}}test_read_exec_hi:
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_hi
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[COPY]]
-define amdgpu_kernel void @test_read_exec_hi(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_exec_hi(ptr addrspace(1) %out) #0 {
   %exec_hi = call i32 @llvm.read_register.i32(metadata !6)
-  store i32 %exec_hi, i32 addrspace(1)* %out
+  store i32 %exec_hi, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll
index 3093d979dbdca..22fe358b3b6be 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-scalar.ll
@@ -5,12 +5,12 @@
 ; GCN: s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}}
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}}
-define amdgpu_kernel void @reassoc_i32(i32 addrspace(1)* %arg, i32 %x, i32 %y) {
+define amdgpu_kernel void @reassoc_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
 bb:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %add1 = add i32 %x, %tid
   %add2 = add i32 %add1, %y
-  store i32 %add2, i32 addrspace(1)* %arg, align 4
+  store i32 %add2, ptr addrspace(1) %arg, align 4
   ret void
 }
 
@@ -18,12 +18,12 @@ bb:
 ; GCN:  s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}}
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}}
-define amdgpu_kernel void @reassoc_i32_swap_arg_order(i32 addrspace(1)* %arg, i32 %x, i32 %y) {
+define amdgpu_kernel void @reassoc_i32_swap_arg_order(ptr addrspace(1) %arg, i32 %x, i32 %y) {
 bb:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %add1 = add i32 %tid, %x
   %add2 = add i32 %y, %add1
-  store i32 %add2, i32 addrspace(1)* %arg, align 4
+  store i32 %add2, ptr addrspace(1) %arg, align 4
   ret void
 }
 
@@ -35,13 +35,13 @@ bb:
 ; GCN-DAG:  v_mov_b32_e32 [[VADD1H:v[0-9]+]], [[ADD1H]]
 ; GFX8:     v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, [[VADD1H]], vcc
 ; GFX9:     v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, [[VADD1H]], vcc
-define amdgpu_kernel void @reassoc_i64(i64 addrspace(1)* %arg, i64 %x, i64 %y) {
+define amdgpu_kernel void @reassoc_i64(ptr addrspace(1) %arg, i64 %x, i64 %y) {
 bb:
   %tid32 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tid = zext i32 %tid32 to i64
   %add1 = add i64 %x, %tid
   %add2 = add i64 %add1, %y
-  store i64 %add2, i64 addrspace(1)* %arg, align 8
+  store i64 %add2, ptr addrspace(1) %arg, align 8
   ret void
 }
 
@@ -52,7 +52,7 @@ bb:
 ; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD2]], v{{[0-9]+}}
 ; GFX9-DAG: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}}
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD2]], v{{[0-9]+}}
-define amdgpu_kernel void @reassoc_v2i32(<2 x i32> addrspace(1)* %arg, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @reassoc_v2i32(ptr addrspace(1) %arg, <2 x i32> %x, <2 x i32> %y) {
 bb:
   %t1 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %t2 = tail call i32 @llvm.amdgcn.workitem.id.y()
@@ -60,7 +60,7 @@ bb:
   %v2 = insertelement <2 x i32> %v1, i32 %t2, i32 1
   %add1 = add <2 x i32> %x, %v2
   %add2 = add <2 x i32> %add1, %y
-  store <2 x i32> %add2, <2 x i32> addrspace(1)* %arg, align 4
+  store <2 x i32> %add2, ptr addrspace(1) %arg, align 4
   ret void
 }
 
@@ -68,12 +68,12 @@ bb:
 ; GCN:  s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
 ; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}}
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}}
-define amdgpu_kernel void @reassoc_i32_nuw(i32 addrspace(1)* %arg, i32 %x, i32 %y) {
+define amdgpu_kernel void @reassoc_i32_nuw(ptr addrspace(1) %arg, i32 %x, i32 %y) {
 bb:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %add1 = add i32 %x, %tid
   %add2 = add nuw i32 %add1, %y
-  store i32 %add2, i32 addrspace(1)* %arg, align 4
+  store i32 %add2, ptr addrspace(1) %arg, align 4
   ret void
 }
 
@@ -82,13 +82,13 @@ bb:
 ; GFX9: v_add_u32_e32 [[ADD1:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, [[ADD1]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[ADD1]]
-define amdgpu_kernel void @reassoc_i32_multiuse(i32 addrspace(1)* %arg, i32 %x, i32 %y) {
+define amdgpu_kernel void @reassoc_i32_multiuse(ptr addrspace(1) %arg, i32 %x, i32 %y) {
 bb:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %add1 = add i32 %x, %tid
   %add2 = add i32 %add1, %y
-  store volatile i32 %add1, i32 addrspace(1)* %arg, align 4
-  store volatile i32 %add2, i32 addrspace(1)* %arg, align 4
+  store volatile i32 %add1, ptr addrspace(1) %arg, align 4
+  store volatile i32 %add2, ptr addrspace(1) %arg, align 4
   ret void
 }
 
@@ -99,13 +99,13 @@ bb:
 ; GFX9: v_add_u32_e32 [[ADD1:v[0-9]+]],  42, v{{[0-9]+}}
 ; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, [[ADD1]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[ADD1]]
-define amdgpu_kernel void @reassoc_i32_const(i32 addrspace(1)* %arg, i32 %x) {
+define amdgpu_kernel void @reassoc_i32_const(ptr addrspace(1) %arg, i32 %x) {
 bb:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %add1 = add i32 %tid, 42
   %add2 = add i32 %add1, %x
-  store volatile i32 %add1, i32 addrspace(1)* %arg, align 4
-  store volatile i32 %add2, i32 addrspace(1)* %arg, align 4
+  store volatile i32 %add1, ptr addrspace(1) %arg, align 4
+  store volatile i32 %add2, ptr addrspace(1) %arg, align 4
   ret void
 }
 
@@ -119,9 +119,9 @@ define amdgpu_kernel void @reassoc_i32_ga(i64 %x) {
 bb:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %t64 = zext i32 %tid to i64
-  %add1 = getelementptr [4 x i32], [4 x i32] addrspace(1)* @var, i64 0, i64 %t64
-  %add2 = getelementptr i32, i32 addrspace(1)* %add1, i64 %x
-  store volatile i32 1, i32 addrspace(1)* %add2, align 4
+  %add1 = getelementptr [4 x i32], ptr addrspace(1) @var, i64 0, i64 %t64
+  %add2 = getelementptr i32, ptr addrspace(1) %add1, i64 %x
+  store volatile i32 1, ptr addrspace(1) %add2, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll b/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
index 9f8667d359932..ceec9663b0760 100644
--- a/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
@@ -6,7 +6,7 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
-define amdgpu_kernel void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 {
+define amdgpu_kernel void @reg_coalescer_breaks_dead(ptr addrspace(1) nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 {
 bb:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %cmp0 = icmp eq i32 %id.x, 0
@@ -25,14 +25,14 @@ bb6:                                              ; preds = %bb6, %bb3
   %tmp8 = add nsw i32 0, %arg1
   %tmp9 = add nsw i32 %tmp8, 0
   %tmp10 = sext i32 %tmp9 to i64
-  %tmp11 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %arg, i64 %tmp10
-  %tmp12 = load <2 x i32>, <2 x i32> addrspace(1)* %tmp11, align 8
+  %tmp11 = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i64 %tmp10
+  %tmp12 = load <2 x i32>, ptr addrspace(1) %tmp11, align 8
   %tmp13 = add <2 x i32> %tmp12, %tmp7
   %tmp14 = icmp slt i32 undef, %arg2
   br i1 %tmp14, label %bb6, label %bb4
 
 bb15:                                             ; preds = %bb4
-  store <2 x i32> %tmp5, <2 x i32> addrspace(3)* undef, align 8
+  store <2 x i32> %tmp5, ptr addrspace(3) undef, align 8
   br label %bb16
 
 bb16:                                             ; preds = %bb15, %bb4

diff  --git a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
index b125eecb975cd..0823e2d23aa0c 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
@@ -13,7 +13,7 @@
 ; CHECK: ; def v[4:19] v[20:27] v[0:4] v[0:3] a[0:15]
 ; CHECK: ; clobber
 ; CHECK: ; use v[4:19] v[20:27] v[0:4] v[0:3] a[1:16]
-define void @illegal_eviction_assert(<32 x i32> addrspace(1)* %arg) #0 {
+define void @illegal_eviction_assert(ptr addrspace(1) %arg) #0 {
   ;%agpr0 = call i32 asm sideeffect "; def $0","=${a0}"()
   %asm = call %asm.output asm sideeffect "; def $0 $1 $2 $3 $4","=v,=v,=v,=v,={a[0:15]}"()
   %vgpr0 = extractvalue %asm.output %asm, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
index 26a76cf2041e1..35149d354f2a8 100644
--- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -9,22 +9,22 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
 ; SI: ; Kernel info:
 ; SI: ; NumSgprs: {{[0-9]+}}
 ; SI: ; NumVgprs: {{[0-9]+}}
-define amdgpu_kernel void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind {
+define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %abase, ptr addrspace(1) %bbase) nounwind {
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
-  %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid
-  %bptr = getelementptr i32, i32 addrspace(1)* %bbase, i32 %tid
-  %outptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %bptr, align 4
+  %aptr = getelementptr i32, ptr addrspace(1) %abase, i32 %tid
+  %bptr = getelementptr i32, ptr addrspace(1) %bbase, i32 %tid
+  %outptr = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %aptr, align 4
+  %b = load i32, ptr addrspace(1) %bptr, align 4
   %result = add i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %outptr, align 4
+  store i32 %result, ptr addrspace(1) %outptr, align 4
   ret void
 }
 
 ; SI-LABEL: {{^}}one_vgpr_used:
 ; SI: NumVgprs: 1
-define amdgpu_kernel void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind {
-  store i32 %x, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) nounwind {
+  store i32 %x, ptr addrspace(1) %out, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/rel32.ll b/llvm/test/CodeGen/AMDGPU/rel32.ll
index e9323c9fccffb..41bf8f4ea8434 100644
--- a/llvm/test/CodeGen/AMDGPU/rel32.ll
+++ b/llvm/test/CodeGen/AMDGPU/rel32.ll
@@ -8,7 +8,7 @@
 ; CHECK: s_getpc_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
 ; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo-4
 ; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+4
-define i32 addrspace(4)* @rel32_neg_offset() {
-  %r = getelementptr i32, i32 addrspace(4)* @g, i64 -2
-  ret i32 addrspace(4)* %r
+define ptr addrspace(4) @rel32_neg_offset() {
+  %r = getelementptr i32, ptr addrspace(4) @g, i64 -2
+  ret ptr addrspace(4) %r
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
index 432540283d22e..26cde1e6cd59d 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
+++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
@@ -8,7 +8,7 @@
 ; GCN-NOT:     v_writelane_b32
 ; GCN:         s_cbranch_{{[^ ]+}} [[LOOP]]
 ; GCN: .sgpr_spill_count: 0
-define amdgpu_kernel void @test_remat_sgpr(double addrspace(1)* %arg, double addrspace(1)* %arg1) {
+define amdgpu_kernel void @test_remat_sgpr(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) {
 bb:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb3
@@ -20,8 +20,8 @@ bb3:                                              ; preds = %bb3, %bb
   %i4 = phi i32 [ 0, %bb ], [ %i22, %bb3 ]
   %i5 = add nuw nsw i32 %i4, %i
   %i6 = zext i32 %i5 to i64
-  %i7 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %i6
-  %i8 = load double, double addrspace(1)* %i7, align 8
+  %i7 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %i6
+  %i8 = load double, ptr addrspace(1) %i7, align 8
   %i9 = fadd double %i8, 0x3EFC01997CC9E6B0
   %i10 = tail call double @llvm.fma.f64(double %i8, double %i9, double 0x3FBE25E43ABE935A)
   %i11 = tail call double @llvm.fma.f64(double %i10, double %i9, double 0x3FC110EF47E6C9C2)
@@ -34,8 +34,8 @@ bb3:                                              ; preds = %bb3, %bb
   %i18 = tail call double @llvm.fma.f64(double %i17, double %i9, double 0x3FD799999999799C)
   %i19 = tail call double @llvm.fma.f64(double %i18, double %i9, double 0x3FD699999999699C)
   %i20 = tail call double @llvm.fma.f64(double %i19, double %i9, double 0x3FD599999999599C)
-  %i21 = getelementptr inbounds double, double addrspace(1)* %arg1, i64 %i6
-  store double %i19, double addrspace(1)* %i21, align 8
+  %i21 = getelementptr inbounds double, ptr addrspace(1) %arg1, i64 %i6
+  store double %i19, ptr addrspace(1) %i21, align 8
   %i22 = add nuw nsw i32 %i4, 1
   %i23 = icmp eq i32 %i22, 1024
   br i1 %i23, label %bb2, label %bb3

diff  --git a/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll b/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
index 5d4955aa1ce2f..9839b71980041 100644
--- a/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
@@ -28,6 +28,6 @@ B30.1:
 B30.2:
   %v3 = phi <4 x float> [ %sub, %B30.1 ], [ %v2, %B20.2 ]
   %ve0 = extractelement <4 x float> %v3, i32 0
-  store float %ve0, float addrspace(3)* undef, align 4
+  store float %ve0, ptr addrspace(3) undef, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-to-declare-only-func.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-to-declare-only-func.ll
index a5e2054512e78..b54876ad775e8 100644
--- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-to-declare-only-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-to-declare-only-func.ll
@@ -20,10 +20,10 @@ declare i32 @foo()
 ; No change
 define internal void @goo() {
 ; CHECK-LABEL: entry:
-; CHECK:   store i32 undef, i32 addrspace(3)* @lds, align 4
+; CHECK:   store i32 undef, ptr addrspace(3) @lds, align 4
 ; CHECK:   ret void
 entry:
-  store i32 undef, i32 addrspace(3)* @lds, align 4
+  store i32 undef, ptr addrspace(3) @lds, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll
index 7c1de3b917bb9..f92c1a7e5029f 100644
--- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll
@@ -14,14 +14,14 @@
 @lds.2 = addrspace(3) global i32 undef, align 4
 @lds.3 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
 
-; CHECK: @global_var = addrspace(1) global float* addrspacecast (float addrspace(3)* bitcast ([4 x i32] addrspace(3)* @lds to float addrspace(3)*) to float*), align 8
-; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
-; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
-; CHECK: @alias.to.lds.3 = alias [1 x i8], [1 x i8] addrspace(3)* @lds.3
-@global_var = addrspace(1) global float* addrspacecast ([4 x i32] addrspace(3)* @lds to float*), align 8
-@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i16 addrspace(3)* @lds.1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
-@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
-@alias.to.lds.3 = alias [1 x i8], [1 x i8] addrspace(3)* @lds.3
+; CHECK: @global_var = addrspace(1) global ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 8
+; CHECK: @llvm.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @lds.1 to ptr)], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @lds.2 to ptr)], section "llvm.metadata"
+; CHECK: @alias.to.lds.3 = alias [1 x i8], ptr addrspace(3) @lds.3
+@global_var = addrspace(1) global ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 8
+@llvm.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @lds.1 to ptr)], section "llvm.metadata"
+@llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @lds.2 to ptr)], section "llvm.metadata"
+@alias.to.lds.3 = alias [1 x i8], ptr addrspace(3) @lds.3
 
 ; CHECK-NOT: @lds.ptr
 ; CHECK-NOT: @lds.1.ptr
@@ -30,16 +30,12 @@
 
 define void @f0() {
 ; CHECK-LABEL: entry:
-; CHECK:   %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 0
-; CHECK:   %ld1 = load i16, i16 addrspace(3)* @lds.1
-; CHECK:   %ld2 = load i32, i32 addrspace(3)* @lds.2
-; CHECK:   %gep2 = getelementptr inbounds [1 x i8], [1 x i8] addrspace(3)* @lds.3, i32 0, i32 0
+; CHECK:   %ld1 = load i16, ptr addrspace(3) @lds.1
+; CHECK:   %ld2 = load i32, ptr addrspace(3) @lds.2
 ; CHECK:   ret void
 entry:
-  %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 0
-  %ld1 = load i16, i16 addrspace(3)* @lds.1
-  %ld2 = load i32, i32 addrspace(3)* @lds.2
-  %gep2 = getelementptr inbounds [1 x i8], [1 x i8] addrspace(3)* @lds.3, i32 0, i32 0
+  %ld1 = load i16, ptr addrspace(3) @lds.1
+  %ld2 = load i32, ptr addrspace(3) @lds.2
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll
index 1d04b9d4a151d..f4a5757907b70 100644
--- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll
@@ -13,10 +13,10 @@
 
 define void @f0(i32 %x) {
 ; CHECK-LABEL: entry:
-; CHECK:   store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_func, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_func, i32 0, i32 0) to i32*) to i64)) to i32*), align 4
+; CHECK:   store i32 %x, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_func to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_func to ptr) to i64)) to ptr), align 4
 ; CHECK:   ret void
 entry:
-  store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  store i32 %x, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_func to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_func to ptr) to i64)) to ptr), align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll
index f890164e1f03a..821dc1e1ddf43 100644
--- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll
@@ -13,13 +13,13 @@
 
 define amdgpu_kernel void @k0() {
 ; CHECK-LABEL: entry:
-; CHECK:   %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64)) to i32*), align 4
+; CHECK:   %ld = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64)) to ptr), align 4
 ; CHECK:   %mul = mul i32 %ld, 2
-; CHECK:   store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64)) to i32*), align 4
+; CHECK:   store i32 %mul, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64)) to ptr), align 4
 ; CHECK:   ret void
 entry:
-  %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  %ld = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64)) to ptr), align 4
   %mul = mul i32 %ld, 2
-  store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  store i32 %mul, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64)) to ptr), align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll
index df82e38410609..ba4153bd3e59c 100644
--- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll
@@ -13,10 +13,8 @@
 
 define internal void @f0() {
 ; CHECK-LABEL: entry:
-; CHECK:   %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @not-reachable-lds, i32 0, i32 0
 ; CHECK:   ret void
 entry:
-  %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @not-reachable-lds, i32 0, i32 0
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll
index fe454c7404576..3c2ca5136ae0e 100644
--- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll
@@ -14,10 +14,10 @@
 
 define void @f0() {
 ; CHECK-LABEL: entry:
-; CHECK:   store i8 1, i8 addrspace(3)* @small_lds, align 1
+; CHECK:   store i8 1, ptr addrspace(3) @small_lds, align 1
 ; CHECK:   ret void
 entry:
-  store i8 1, i8 addrspace(3)* @small_lds, align 1
+  store i8 1, ptr addrspace(3) @small_lds, align 1
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll
index 858f9be3aa58f..876b6a9f74df7 100644
--- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll
+++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll
@@ -18,18 +18,16 @@
 ; Pointer replacement code should be added.
 define void @f0(i32 %x) {
 ; CHECK-LABEL: entry:
-; CHECK:   %0 = load i16, i16 addrspace(3)* @used_only_within_func.ptr, align 2
-; CHECK:   %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
-; CHECK:   %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
-; CHECK:   %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
-; CHECK:   %4 = addrspacecast i32 addrspace(3)* %3 to i32*
-; CHECK:   %5 = ptrtoint i32* %4 to i64
-; CHECK:   %6 = add i64 %5, %5
-; CHECK:   %7 = inttoptr i64 %6 to i32*
-; CHECK:   store i32 %x, i32* %7, align 4
+; CHECK:   %0 = load i16, ptr addrspace(3) @used_only_within_func.ptr, align 2
+; CHECK:   %1 = getelementptr i8, ptr addrspace(3) null, i16 %0
+; CHECK:   %2 = addrspacecast ptr addrspace(3) %1 to ptr
+; CHECK:   %3 = ptrtoint ptr %2 to i64
+; CHECK:   %4 = add i64 %3, %3
+; CHECK:   %5 = inttoptr i64 %4 to ptr
+; CHECK:   store i32 %x, ptr %5, align 4
 ; CHECK:   ret void
 entry:
-  store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  store i32 %x, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_func to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_func to ptr) to i64)) to ptr), align 4
   ret void
 }
 
@@ -41,7 +39,7 @@ define amdgpu_kernel void @k0() {
 ; CHECK:   br i1 %1, label %2, label %3
 ;
 ; CHECK-LABEL: 2:
-; CHECK:   store i16 ptrtoint ([4 x i32] addrspace(3)* @used_only_within_func to i16), i16 addrspace(3)* @used_only_within_func.ptr, align 2
+; CHECK:   store i16 ptrtoint (ptr addrspace(3) @used_only_within_func to i16), ptr addrspace(3) @used_only_within_func.ptr, align 2
 ; CHECK:   br label %3
 ;
 ; CHECK-LABEL: 3:

diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll
index 43d120d5d326c..22b2941b4071a 100644
--- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll
+++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll
@@ -24,16 +24,15 @@
 ; Pointer replacement code should be added.
 define internal void @function() {
 ; CHECK-LABEL: entry:
-; CHECK:   %0 = load i16, i16 addrspace(3)* @lds_used_within_function.ptr, align 2
-; CHECK:   %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
-; CHECK:   %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
-; CHECK:   %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 2
-; CHECK:   %4 = addrspacecast i32 addrspace(3)* %3 to i32*
-; CHECK:   %5 = ptrtoint i32* %4 to i32
-; CHECK:   %6 = add i32 %5, ptrtoint (i32 addrspace(1)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(1)* @global_var, i32 0, i32 2) to i32)
+; CHECK:   %0 = load i16, ptr addrspace(3) @lds_used_within_function.ptr, align 2
+; CHECK:   %1 = getelementptr i8, ptr addrspace(3) null, i16 %0
+; CHECK:   %2 = getelementptr inbounds [4 x i32], ptr addrspace(3) %1, i32 0, i32 2
+; CHECK:   %3 = addrspacecast ptr addrspace(3) %2 to ptr
+; CHECK:   %4 = ptrtoint ptr %3 to i32
+; CHECK:   %5 = add i32 %4, ptrtoint (ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @global_var, i32 0, i32 2) to i32)
 ; CHECK:   ret void
 entry:
-  %0 = add i32 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 2) to i32*) to i32), ptrtoint (i32 addrspace(1)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(1)* @global_var, i32 0, i32 2) to i32)
+  %0 = add i32 ptrtoint (ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @lds_used_within_function, i32 0, i32 2) to ptr) to i32), ptrtoint (ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @global_var, i32 0, i32 2) to i32)
   ret void
 }
 
@@ -45,7 +44,7 @@ define protected amdgpu_kernel void @kernel() {
 ; CHECK:   br i1 %1, label %2, label %3
 ;
 ; CHECK-LABEL: 2:
-; CHECK:   store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function to i16), i16 addrspace(3)* @lds_used_within_function.ptr, align 2
+; CHECK:   store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function to i16), ptr addrspace(3) @lds_used_within_function.ptr, align 2
 ; CHECK:   br label %3
 ;
 ; CHECK-LABEL: 3:

diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index 607dc0623236e..a67691f42bf27 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -99,7 +99,7 @@ define amdgpu_kernel void @test_kernel() !dbg !3 {
   call void asm sideeffect "; clobber v8", "~{v8}"()
   call void asm sideeffect "; clobber s23", "~{s23}"()
   call void asm sideeffect "; clobber a42", "~{a42}"()
-  call void asm sideeffect "; use $0", "v"([128 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(3) @lds)
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/ret.ll b/llvm/test/CodeGen/AMDGPU/ret.ll
index 97c07a3d7773f..e157d5ce9e984 100644
--- a/llvm/test/CodeGen/AMDGPU/ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/ret.ll
@@ -9,7 +9,7 @@
 ; GCN: s_waitcnt expcnt(0)
 ; GCN: v_add_f32_e32 v0, 1.0, v1
 ; GCN-NOT: s_endpgm
-define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+define amdgpu_vs { float, float } @vgpr(ptr addrspace(4) inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
   %x = fadd float %arg3, 1.000000e+00
@@ -27,7 +27,7 @@ bb:
 ; GCN-DAG: v_mov_b32_e32 v3, -1.0
 ; GCN-DAG: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
-define amdgpu_vs { float, float, float, float } @vgpr_literal([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+define amdgpu_vs { float, float, float, float } @vgpr_literal(ptr addrspace(4) inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
   ret { float, float, float, float } { float 1.000000e+00, float 2.000000e+00, float 4.000000e+00, float -1.000000e+00 }
@@ -44,7 +44,7 @@ bb:
 ; GCN: v_mov_b32_e32 v3, v4
 ; GCN: v_mov_b32_e32 v4, v6
 ; GCN-NOT: s_endpgm
-define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr0([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr0(ptr addrspace(4) inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
 bb:
   %i0 = extractelement <2 x i32> %arg4, i32 0
   %i1 = extractelement <2 x i32> %arg4, i32 1
@@ -69,7 +69,7 @@ bb:
 ; GCN-LABEL: {{^}}ps_input_ena_no_inputs:
 ; GCN: v_mov_b32_e32 v0, 1.0
 ; GCN-NOT: s_endpgm
-define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+define amdgpu_ps float @ps_input_ena_no_inputs(ptr addrspace(4) inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
 bb:
   ret float 1.000000e+00
 }
@@ -83,7 +83,7 @@ bb:
 ; GCN-DAG: v_mov_b32_e32 v1, v2
 ; GCN-DAG: v_mov_b32_e32 v2, v3
 ; GCN-NOT: s_endpgm
-define amdgpu_ps { float, <2 x float> } @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+define amdgpu_ps { float, <2 x float> } @ps_input_ena_pos_w(ptr addrspace(4) inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
 bb:
   %f = bitcast <2 x i32> %arg8 to <2 x float>
   %s = insertvalue { float, <2 x float> } undef, float %arg14, 0
@@ -102,7 +102,7 @@ bb:
 ; GCN-DAG: v_mov_b32_e32 v3, v6
 ; GCN-DAG: v_mov_b32_e32 v4, v8
 ; GCN-NOT: s_endpgm
-define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr1([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #2 {
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr1(ptr addrspace(4) inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #2 {
 bb:
   %i0 = extractelement <2 x i32> %arg4, i32 0
   %i1 = extractelement <2 x i32> %arg4, i32 1
@@ -131,7 +131,7 @@ bb:
 ; GCN-DAG: v_mov_b32_e32 v3, v8
 ; GCN-DAG: v_mov_b32_e32 v4, v12
 ; GCN-NOT: s_endpgm
-define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #3 {
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119(ptr addrspace(4) inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #3 {
 bb:
   %i0 = extractelement <2 x i32> %arg4, i32 0
   %i1 = extractelement <2 x i32> %arg4, i32 1
@@ -160,7 +160,7 @@ bb:
 ; GCN: v_mov_b32_e32 v3, v4
 ; GCN: v_mov_b32_e32 v4, v8
 ; GCN-NOT: s_endpgm
-define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr418([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #4 {
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr418(ptr addrspace(4) inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #4 {
 bb:
   %i0 = extractelement <2 x i32> %arg4, i32 0
   %i1 = extractelement <2 x i32> %arg4, i32 1
@@ -182,7 +182,7 @@ bb:
 ; GCN-DAG: s_mov_b32 s2, s3
 ; GCN-DAG: s_add_{{i|u}}32 s0, s3, 2
 ; GCN-NOT: s_endpgm
-define amdgpu_vs { i32, i32, i32 } @sgpr([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+define amdgpu_vs { i32, i32, i32 } @sgpr(ptr addrspace(4) inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
   %x = add i32 %arg2, 2
   %a = insertvalue { i32, i32, i32 } undef, i32 %x, 0
@@ -198,7 +198,7 @@ bb:
 ; GCN-DAG: s_mov_b32 s2, 7
 ; GCN-DAG: s_mov_b32 s3, 8
 ; GCN-NOT: s_endpgm
-define amdgpu_vs { i32, i32, i32, i32 } @sgpr_literal([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+define amdgpu_vs { i32, i32, i32, i32 } @sgpr_literal(ptr addrspace(4) inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
   %x = add i32 %arg2, 2
   ret { i32, i32, i32, i32 } { i32 5, i32 6, i32 7, i32 8 }
@@ -213,7 +213,7 @@ bb:
 ; GCN-DAG: s_add_{{i|u}}32 s0, s3, 2
 ; GCN-DAG: s_mov_b32 s2, s3
 ; GCN-NOT: s_endpgm
-define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+define amdgpu_vs { float, i32, float, i32, i32 } @both(ptr addrspace(4) inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
   %v = fadd float %arg3, 1.000000e+00
@@ -235,7 +235,7 @@ bb:
 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
 ; GCN-DAG: v_mov_b32_e32 v2, 4.0
 ; GCN-DAG: s_waitcnt expcnt(0)
-define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal(ptr addrspace(4) inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
   ret { { float, i32 }, { i32, <2 x float> } } { { float, i32 } { float 1.000000e+00, i32 2 }, { i32, <2 x float> } { i32 3, <2 x float> <float 2.000000e+00, float 4.000000e+00> } }

diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll
index b06f3b8d68453..58d209764d7b7 100644
--- a/llvm/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll
@@ -21,7 +21,7 @@
 ; GCN-NEXT: [[RET_BB]]:
 ; GCN-NEXT: ; return
 ; GCN-NEXT: .Lfunc_end0
-define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [17 x <8 x i32>] addrspace(4)* inreg %arg2, i32 addrspace(4)* inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
 entry:
   %i.i = extractelement <2 x i32> %arg7, i32 0
   %j.i = extractelement <2 x i32> %arg7, i32 1
@@ -73,7 +73,7 @@ ret.bb:                                          ; preds = %else, %main_body
 
 ; GCN: ; %ret.bb
 ; GCN: store_dword
-define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [17 x <8 x i32>] addrspace(4)* inreg %arg2, i32 addrspace(4)* inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
 main_body:
   %i.i = extractelement <2 x i32> %arg7, i32 0
   %j.i = extractelement <2 x i32> %arg7, i32 1
@@ -99,11 +99,11 @@ else:                                             ; preds = %main_body
   br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
 
 unreachable.bb:                                           ; preds = %else
-  store volatile i32 8, i32 addrspace(3)* undef
+  store volatile i32 8, ptr addrspace(3) undef
   unreachable
 
 ret.bb:                                          ; preds = %else, %main_body
-  store volatile i32 11, i32 addrspace(1)* undef
+  store volatile i32 11, ptr addrspace(1) undef
   ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/returnaddress.ll b/llvm/test/CodeGen/AMDGPU/returnaddress.ll
index d9fff7c451174..3d8e10b00f6d4 100644
--- a/llvm/test/CodeGen/AMDGPU/returnaddress.ll
+++ b/llvm/test/CodeGen/AMDGPU/returnaddress.ll
@@ -6,10 +6,10 @@
 ; GCN: v_mov_b32_e32 v0, s30
 ; GCN: v_mov_b32_e32 v1, s31
 ; GCN: s_setpc_b64 s[30:31]
-define i8* @func1() nounwind {
+define ptr @func1() nounwind {
 entry:
-  %0 = tail call i8* @llvm.returnaddress(i32 0)
-  ret i8* %0
+  %0 = tail call ptr @llvm.returnaddress(i32 0)
+  ret ptr %0
 }
 
 ; Test with non-zero frame
@@ -17,20 +17,20 @@ entry:
 ; GCN: v_mov_b32_e32 v0, 0
 ; GCN: v_mov_b32_e32 v1, 0
 ; GCN: s_setpc_b64 s[30:31]
-define i8* @func2() nounwind {
+define ptr @func2() nounwind {
 entry:
-  %0 = tail call i8* @llvm.returnaddress(i32 1)
-  ret i8* %0
+  %0 = tail call ptr @llvm.returnaddress(i32 1)
+  ret ptr %0
 }
 
 ; Test with amdgpu_kernel
 ; GCN-LABEL: {{^}}func3
 ; GCN: v_mov_b32_e32 v0, 0
 ; GCN: v_mov_b32_e32 v1, {{v0|0}}
-define amdgpu_kernel void @func3(i8** %out) nounwind {
+define amdgpu_kernel void @func3(ptr %out) nounwind {
 entry:
-  %tmp = tail call i8* @llvm.returnaddress(i32 0)
-  store i8* %tmp, i8** %out, align 4
+  %tmp = tail call ptr @llvm.returnaddress(i32 0)
+  store ptr %tmp, ptr %out, align 4
   ret void
 }
 
@@ -38,14 +38,14 @@ entry:
 ; GCN-LABEL: {{^}}func4
 ; GCN: v_mov_b32_e32 v0, 0
 ; GCN: v_mov_b32_e32 v1, {{v0|0}}
-define amdgpu_kernel void @func4(i8** %out, i32 %val) nounwind {
+define amdgpu_kernel void @func4(ptr %out, i32 %val) nounwind {
 entry:
   %cmp = icmp ne i32 %val, 0
   br i1 %cmp, label %store, label %exit
 
 store:
-  %tmp = tail call i8* @llvm.returnaddress(i32 1)
-  store i8* %tmp, i8** %out, align 4
+  %tmp = tail call ptr @llvm.returnaddress(i32 1)
+  store ptr %tmp, ptr %out, align 4
   ret void
 
 exit:
@@ -57,8 +57,8 @@ exit:
 ; GCN: v_mov_b32_e32 v0, 0
 define void @func5() nounwind {
 entry:
-  %tmp = tail call i8* @llvm.returnaddress(i32 2)
-  store volatile i32 0, i32 addrspace(3)* undef, align 4
+  %tmp = tail call ptr @llvm.returnaddress(i32 2)
+  store volatile i32 0, ptr addrspace(3) undef, align 4
   unreachable
 }
 
@@ -72,12 +72,12 @@ declare void @callee()
 ; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]]
 define void @multi_use() nounwind {
 entry:
-  %ret0 = tail call i8* @llvm.returnaddress(i32 0)
-  store volatile i8* %ret0, i8* addrspace(1)* undef
+  %ret0 = tail call ptr @llvm.returnaddress(i32 0)
+  store volatile ptr %ret0, ptr addrspace(1) undef
   call void @callee()
-  %ret1 = tail call i8* @llvm.returnaddress(i32 0)
-  store volatile i8* %ret1, i8* addrspace(1)* undef
+  %ret1 = tail call ptr @llvm.returnaddress(i32 0)
+  store volatile ptr %ret1, ptr addrspace(1) undef
   ret void
 }
 
-declare i8* @llvm.returnaddress(i32) nounwind readnone
+declare ptr @llvm.returnaddress(i32) nounwind readnone

diff --git a/llvm/test/CodeGen/AMDGPU/rotl.i64.ll b/llvm/test/CodeGen/AMDGPU/rotl.i64.ll
index 1134f191c888b..8c0ada9763311 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.i64.ll
@@ -7,13 +7,13 @@
 ; BOTH-DAG: s_lshr_b64
 ; BOTH: s_or_b64
 ; BOTH: s_endpgm
-define amdgpu_kernel void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_rotl_i64(ptr addrspace(1) %in, i64 %x, i64 %y) {
 entry:
   %0 = shl i64 %x, %y
   %1 = sub i64 64, %y
   %2 = lshr i64 %x, %1
   %3 = or i64 %0, %2
-  store i64 %3, i64 addrspace(1)* %in
+  store i64 %3, ptr addrspace(1) %in
   ret void
 }
 
@@ -26,14 +26,14 @@ entry:
 ; BOTH: v_or_b32
 ; BOTH: v_or_b32
 ; BOTH: s_endpgm
-define amdgpu_kernel void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_rotl_i64(ptr addrspace(1) %in, ptr addrspace(1) %xptr, ptr addrspace(1) %yptr) {
 entry:
-  %x = load i64, i64 addrspace(1)* %xptr, align 8
-  %y = load i64, i64 addrspace(1)* %yptr, align 8
+  %x = load i64, ptr addrspace(1) %xptr, align 8
+  %y = load i64, ptr addrspace(1) %yptr, align 8
   %tmp0 = shl i64 %x, %y
   %tmp1 = sub i64 64, %y
   %tmp2 = lshr i64 %x, %tmp1
   %tmp3 = or i64 %tmp0, %tmp2
-  store i64 %tmp3, i64 addrspace(1)* %in, align 8
+  store i64 %tmp3, ptr addrspace(1) %in, align 8
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 12c46d3605289..7473d128e7d82 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -10,13 +10,13 @@
 ; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
 ; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]]
 ; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]]
-define amdgpu_kernel void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 entry:
   %0 = shl i32 %x, %y
   %1 = sub i32 32, %y
   %2 = lshr i32 %x, %1
   %3 = or i32 %0, %2
-  store i32 %3, i32 addrspace(1)* %in
+  store i32 %3, ptr addrspace(1) %in
   ret void
 }
 
@@ -26,13 +26,13 @@ entry:
 ; SI-DAG: v_alignbit_b32
 ; SI-DAG: v_alignbit_b32
 ; SI: s_endpgm
-define amdgpu_kernel void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
 entry:
   %0 = shl <2 x i32> %x, %y
   %1 = sub <2 x i32> <i32 32, i32 32>, %y
   %2 = lshr <2 x i32> %x, %1
   %3 = or <2 x i32> %0, %2
-  store <2 x i32> %3, <2 x i32> addrspace(1)* %in
+  store <2 x i32> %3, ptr addrspace(1) %in
   ret void
 }
 
@@ -46,13 +46,13 @@ entry:
 ; SI-DAG: s_sub_i32
 ; SI-DAG: v_alignbit_b32
 ; SI: s_endpgm
-define amdgpu_kernel void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) {
 entry:
   %0 = shl <4 x i32> %x, %y
   %1 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
   %2 = lshr <4 x i32> %x, %1
   %3 = or <4 x i32> %0, %2
-  store <4 x i32> %3, <4 x i32> addrspace(1)* %in
+  store <4 x i32> %3, ptr addrspace(1) %in
   ret void
 }
 
@@ -69,14 +69,14 @@ entry:
 
 declare i16 @llvm.fshl.i16(i16, i16, i16)
 
-define void @test_rotl_i16(i16 addrspace(1)* nocapture readonly %sourceA, i16 addrspace(1)* nocapture readonly %sourceB, i16 addrspace(1)* nocapture %destValues) {
+define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr addrspace(1) nocapture readonly %sourceB, ptr addrspace(1) nocapture %destValues) {
 entry:
-  %arrayidx = getelementptr inbounds i16, i16 addrspace(1)* %sourceA, i64 16
-  %a = load i16, i16 addrspace(1)* %arrayidx
-  %arrayidx2 = getelementptr inbounds i16, i16 addrspace(1)* %sourceB, i64 24
-  %b = load i16, i16 addrspace(1)* %arrayidx2
+  %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %sourceA, i64 16
+  %a = load i16, ptr addrspace(1) %arrayidx
+  %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %sourceB, i64 24
+  %b = load i16, ptr addrspace(1) %arrayidx2
   %c = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 %b)
-  %arrayidx5 = getelementptr inbounds i16, i16 addrspace(1)* %destValues, i64 4
-  store i16 %c, i16 addrspace(1)* %arrayidx5
+  %arrayidx5 = getelementptr inbounds i16, ptr addrspace(1) %destValues, i64 4
+  store i16 %c, ptr addrspace(1) %arrayidx5
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/rotr.i64.ll b/llvm/test/CodeGen/AMDGPU/rotr.i64.ll
index c55af2376ff0f..383e5ea1e4943 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.i64.ll
@@ -6,13 +6,13 @@
 ; BOTH-DAG: s_lshr_b64
 ; BOTH-DAG: s_lshl_b64
 ; BOTH: s_or_b64
-define amdgpu_kernel void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_rotr_i64(ptr addrspace(1) %in, i64 %x, i64 %y) {
 entry:
   %tmp0 = sub i64 64, %y
   %tmp1 = shl i64 %x, %tmp0
   %tmp2 = lshr i64 %x, %y
   %tmp3 = or i64 %tmp1, %tmp2
-  store i64 %tmp3, i64 addrspace(1)* %in
+  store i64 %tmp3, ptr addrspace(1) %in
   ret void
 }
 
@@ -24,38 +24,38 @@ entry:
 ; VI-DAG: v_lshlrev_b64
 ; BOTH: v_or_b32
 ; BOTH: v_or_b32
-define amdgpu_kernel void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_rotr_i64(ptr addrspace(1) %in, ptr addrspace(1) %xptr, ptr addrspace(1) %yptr) {
 entry:
-  %x = load i64, i64 addrspace(1)* %xptr, align 8
-  %y = load i64, i64 addrspace(1)* %yptr, align 8
+  %x = load i64, ptr addrspace(1) %xptr, align 8
+  %y = load i64, ptr addrspace(1) %yptr, align 8
   %tmp0 = sub i64 64, %y
   %tmp1 = shl i64 %x, %tmp0
   %tmp2 = lshr i64 %x, %y
   %tmp3 = or i64 %tmp1, %tmp2
-  store i64 %tmp3, i64 addrspace(1)* %in
+  store i64 %tmp3, ptr addrspace(1) %in
   ret void
 }
 
 ; BOTH-LABEL: {{^}}s_rotr_v2i64:
-define amdgpu_kernel void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @s_rotr_v2i64(ptr addrspace(1) %in, <2 x i64> %x, <2 x i64> %y) {
 entry:
   %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
   %tmp1 = shl <2 x i64> %x, %tmp0
   %tmp2 = lshr <2 x i64> %x, %y
   %tmp3 = or <2 x i64> %tmp1, %tmp2
-  store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in
+  store <2 x i64> %tmp3, ptr addrspace(1) %in
   ret void
 }
 
 ; BOTH-LABEL: {{^}}v_rotr_v2i64:
-define amdgpu_kernel void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_rotr_v2i64(ptr addrspace(1) %in, ptr addrspace(1) %xptr, ptr addrspace(1) %yptr) {
 entry:
-  %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8
-  %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8
+  %x = load <2 x i64>, ptr addrspace(1) %xptr, align 8
+  %y = load <2 x i64>, ptr addrspace(1) %yptr, align 8
   %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
   %tmp1 = shl <2 x i64> %x, %tmp0
   %tmp2 = lshr <2 x i64> %x, %y
   %tmp3 = or <2 x i64> %tmp1, %tmp2
-  store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in
+  store <2 x i64> %tmp3, ptr addrspace(1) %in
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 84f277bcc0870..7bb71ebf84baa 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -6,13 +6,13 @@
 ; R600: BIT_ALIGN_INT
 
 ; SI: v_alignbit_b32
-define amdgpu_kernel void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 entry:
   %tmp0 = sub i32 32, %y
   %tmp1 = shl i32 %x, %tmp0
   %tmp2 = lshr i32 %x, %y
   %tmp3 = or i32 %tmp1, %tmp2
-  store i32 %tmp3, i32 addrspace(1)* %in
+  store i32 %tmp3, ptr addrspace(1) %in
   ret void
 }
 
@@ -22,13 +22,13 @@ entry:
 
 ; SI: v_alignbit_b32
 ; SI: v_alignbit_b32
-define amdgpu_kernel void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
 entry:
   %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
   %tmp1 = shl <2 x i32> %x, %tmp0
   %tmp2 = lshr <2 x i32> %x, %y
   %tmp3 = or <2 x i32> %tmp1, %tmp2
-  store <2 x i32> %tmp3, <2 x i32> addrspace(1)* %in
+  store <2 x i32> %tmp3, ptr addrspace(1) %in
   ret void
 }
 
@@ -42,13 +42,13 @@ entry:
 ; SI: v_alignbit_b32
 ; SI: v_alignbit_b32
 ; SI: v_alignbit_b32
-define amdgpu_kernel void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) {
 entry:
   %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
   %tmp1 = shl <4 x i32> %x, %tmp0
   %tmp2 = lshr <4 x i32> %x, %y
   %tmp3 = or <4 x i32> %tmp1, %tmp2
-  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %in
+  store <4 x i32> %tmp3, ptr addrspace(1) %in
   ret void
 }
 
@@ -65,14 +65,14 @@ entry:
 
 declare i16 @llvm.fshr.i16(i16, i16, i16)
 
-define void @test_rotr_i16(i16 addrspace(1)* nocapture readonly %sourceA, i16 addrspace(1)* nocapture readonly %sourceB, i16 addrspace(1)* nocapture %destValues) {
+define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr addrspace(1) nocapture readonly %sourceB, ptr addrspace(1) nocapture %destValues) {
 entry:
-  %arrayidx = getelementptr inbounds i16, i16 addrspace(1)* %sourceA, i64 16
-  %a = load i16, i16 addrspace(1)* %arrayidx
-  %arrayidx2 = getelementptr inbounds i16, i16 addrspace(1)* %sourceB, i64 24
-  %b = load i16, i16 addrspace(1)* %arrayidx2
+  %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %sourceA, i64 16
+  %a = load i16, ptr addrspace(1) %arrayidx
+  %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %sourceB, i64 24
+  %b = load i16, ptr addrspace(1) %arrayidx2
   %c = tail call i16 @llvm.fshr.i16(i16 %a, i16 %a, i16 %b)
-  %arrayidx5 = getelementptr inbounds i16, i16 addrspace(1)* %destValues, i64 4
-  store i16 %c, i16 addrspace(1)* %arrayidx5
+  %arrayidx5 = getelementptr inbounds i16, ptr addrspace(1) %destValues, i64 4
+  store i16 %c, ptr addrspace(1) %arrayidx5
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll b/llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll
index 68bbf24856367..140cca9d85cc4 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll
@@ -22,10 +22,10 @@
 
 define amdgpu_cs void @main(i32 inreg %arg) {
 .entry:
-  %tmp44 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
-  %tmp16 = load volatile float, float addrspace(1)* undef
-  %tmp22 = load volatile float, float addrspace(1)* undef
-  %tmp25 = load volatile float, float addrspace(1)* undef
+  %tmp44 = load volatile <2 x float>, ptr addrspace(1) undef
+  %tmp16 = load volatile float, ptr addrspace(1) undef
+  %tmp22 = load volatile float, ptr addrspace(1) undef
+  %tmp25 = load volatile float, ptr addrspace(1) undef
   %tmp31 = fcmp olt float %tmp16, 0x3FA99999A0000000
   br i1 %tmp31, label %bb, label %.exit.thread
 
@@ -45,7 +45,7 @@ bb50:
   br i1 %tmp53, label %.exit3.i, label %.exit.thread
 
 .exit3.i:
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %.exit.thread
 
 .exit.thread:

diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 565ccacbd2638..20862e33cc4b5 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
 
 ; XXX - Why the packing?
-define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: scalar_to_vector_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -43,14 +43,14 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out,
 ; VI-NEXT:    v_mov_b32_e32 v1, v0
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
-  %tmp1 = load i32, i32 addrspace(1)* %in, align 4
+  %tmp1 = load i32, ptr addrspace(1) %in, align 4
   %bc = bitcast i32 %tmp1 to <2 x i16>
   %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
+  store <4 x i16> %tmp2, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
 ; SI-LABEL: scalar_to_vector_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -90,10 +90,10 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out,
 ; VI-NEXT:    v_mov_b32_e32 v1, v0
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
-  %tmp1 = load float, float addrspace(1)* %in, align 4
+  %tmp1 = load float, ptr addrspace(1) %in, align 4
   %bc = bitcast float %tmp1 to <2 x i16>
   %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
+  store <4 x i16> %tmp2, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -126,10 +126,10 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() {
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 bb:
-  %tmp = load <2 x i8>, <2 x i8> addrspace(1)* undef, align 1
+  %tmp = load <2 x i8>, ptr addrspace(1) undef, align 1
   %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 0, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
-  store <8 x i8> %tmp2, <8 x i8> addrspace(1)* undef, align 8
+  store <8 x i8> %tmp2, ptr addrspace(1) undef, align 8
   ret void
 }
 
@@ -164,11 +164,11 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() {
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 bb:
-  %load = load half, half addrspace(1)* undef, align 1
+  %load = load half, ptr addrspace(1) undef, align 1
   %tmp = bitcast half %load to <2 x i8>
   %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 0, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
-  store <8 x i8> %tmp2, <8 x i8> addrspace(1)* undef, align 8
+  store <8 x i8> %tmp2, ptr addrspace(1) undef, align 8
   ret void
 }
 
@@ -176,41 +176,41 @@ bb:
 ; to produce one, but for some reason never made it to selection.
 
 
-; define amdgpu_kernel void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-;   %tmp1 = load i32, i32 addrspace(1)* %in, align 4
+; define amdgpu_kernel void @scalar_to_vector_test2(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+;   %tmp1 = load i32, ptr addrspace(1) %in, align 4
 ;   %bc = bitcast i32 %tmp1 to <4 x i8>
 
 ;   %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-;   store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4
+;   store <8 x i8> %tmp2, ptr addrspace(1) %out, align 4
 ;   ret void
 ; }
 
-; define amdgpu_kernel void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test3(ptr addrspace(1) %out) nounwind {
 ;   %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0
 ;   %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1
 ;   %bc = bitcast <2 x i64> %newvec1 to <4 x i32>
 ;   %add = add <4 x i32> %bc, <i32 1, i32 2, i32 3, i32 4>
-;   store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16
+;   store <4 x i32> %add, ptr addrspace(1) %out, align 16
 ;   ret void
 ; }
 
-; define amdgpu_kernel void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test4(ptr addrspace(1) %out) nounwind {
 ;   %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0
 ;   %bc = bitcast <4 x i32> %newvec0 to <8 x i16>
 ;   %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>
-;   store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16
+;   store <8 x i16> %add, ptr addrspace(1) %out, align 16
 ;   ret void
 ; }
 
-; define amdgpu_kernel void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test5(ptr addrspace(1) %out) nounwind {
 ;   %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
 ;   %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
 ;   %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
-;   store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16
+;   store <4 x i16> %add, ptr addrspace(1) %out, align 16
 ;   ret void
 ; }
 
-define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind {
+define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zeroext %val) nounwind {
 ; SI-LABEL: scalar_to_vector_test6:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -234,6 +234,6 @@ define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out,
 ; VI-NEXT:    s_endpgm
   %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
   %bc = bitcast <4 x i8> %newvec0 to <2 x half>
-  store <2 x half> %bc, <2 x half> addrspace(1)* %out
+  store <2 x half> %bc, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index 7fb21f13bf396..0f65920946ab1 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=GFX908 %s
 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s
 
-define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, <8 x i16>* %out) #0 {
+define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX900-LABEL: scalar_to_vector_v8i16:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -80,13 +80,13 @@ entry:
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %out.gep = getelementptr inbounds <8 x i16>, <8 x i16>* %out, i64 %tid.ext
-  store <8 x i16> %val.6.vec8.i16, <8 x i16>* %out.gep, align 16
+  %out.gep = getelementptr inbounds <8 x i16>, ptr %out, i64 %tid.ext
+  store <8 x i16> %val.6.vec8.i16, ptr %out.gep, align 16
 
   ret void
 }
 
-define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, <8 x half>* %out) #0 {
+define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 {
 ; GFX900-LABEL: scalar_to_vector_v8f16:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -158,8 +158,8 @@ entry:
 
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %out.gep = getelementptr inbounds <8 x half>, <8 x half>* %out, i64 %tid.ext
-  store <8 x half> %val.6.vec8.half, <8 x half>* %out.gep, align 16
+  %out.gep = getelementptr inbounds <8 x half>, ptr %out, i64 %tid.ext
+  store <8 x half> %val.6.vec8.half, ptr %out.gep, align 16
 
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
index 7af841f08d9e9..5659f7cf01a5f 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
@@ -7,9 +7,9 @@
 ; GCN-OPT:   v_mov_b32_e32 [[V:v[0-9]+]], 42
 ; GCN: buffer_store_short [[V]],
 define void @scalar_to_vector_i16() {
-  %tmp = load <2 x i16>, <2 x i16> addrspace(5)* undef
+  %tmp = load <2 x i16>, ptr addrspace(5) undef
   %tmp1 = insertelement <2 x i16> %tmp, i16 42, i64 0
-  store <2 x i16> %tmp1, <2 x i16> addrspace(5)* undef
+  store <2 x i16> %tmp1, ptr addrspace(5) undef
   ret void
 }
 
@@ -19,8 +19,8 @@ define void @scalar_to_vector_i16() {
 ; GCN-OPT:   v_mov_b32_e32 [[V:v[0-9]+]], 0x3c00
 ; GCN: buffer_store_short [[V]],
 define void @scalar_to_vector_f16() {
-  %tmp = load <2 x half>, <2 x half> addrspace(5)* undef
+  %tmp = load <2 x half>, ptr addrspace(5) undef
   %tmp1 = insertelement <2 x half> %tmp, half 1.0, i64 0
-  store <2 x half> %tmp1, <2 x half> addrspace(5)* undef
+  store <2 x half> %tmp1, ptr addrspace(5) undef
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index 0d85776b988db..2cbb505cd55d4 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -5,7 +5,7 @@
 ; while trying to spill SGPRs to memory. After we enabled SGPR spills into virtual VGPRs
 ; the edge case won't arise and the test would always compile.
 
-define amdgpu_kernel void @kernel0(i32 addrspace(1)* %out, i32 %in) #1 {
+define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-LABEL: kernel0:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ;;#ASMSTART

diff --git a/llvm/test/CodeGen/AMDGPU/sched-setprio.ll b/llvm/test/CodeGen/AMDGPU/sched-setprio.ll
index bf60e071679cc..895f81306e957 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-setprio.ll
+++ b/llvm/test/CodeGen/AMDGPU/sched-setprio.ll
@@ -8,13 +8,13 @@ declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i3
 ; GCN: v_mfma
 ; GCN: v_mfma
 ; GCN: s_setprio 0
-define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   call void @llvm.amdgcn.s.setprio(i16 1)
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
   %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 3.0, float 4.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.s.setprio(i16 0)
-  store <4 x float> %mai.2, <4 x float> addrspace(1)* %arg
+  store <4 x float> %mai.2, ptr addrspace(1) %arg
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
index 177957c0b35b8..a5fa03fa4162c 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
@@ -3,7 +3,7 @@
 
 define amdgpu_kernel void @main() #0 {
 main_body:
-  %tmp = load <4 x float>, <4 x float> addrspace(9)* null
+  %tmp = load <4 x float>, ptr addrspace(9) null
   %tmp5 = extractelement <4 x float> %tmp, i32 3
   %tmp6 = fptosi float %tmp5 to i32
   %tmp7 = bitcast i32 %tmp6 to float
@@ -20,11 +20,11 @@ main_body:
   %tmp18 = bitcast float %tmp16 to i32
   %tmp19 = add i32 %tmp17, %tmp18
   %tmp20 = bitcast i32 %tmp19 to float
-  %tmp21 = load <4 x float>, <4 x float> addrspace(9)* null
+  %tmp21 = load <4 x float>, ptr addrspace(9) null
   %tmp22 = extractelement <4 x float> %tmp21, i32 0
-  %tmp23 = load <4 x float>, <4 x float> addrspace(9)* null
+  %tmp23 = load <4 x float>, ptr addrspace(9) null
   %tmp24 = extractelement <4 x float> %tmp23, i32 1
-  %tmp25 = load <4 x float>, <4 x float> addrspace(9)* null
+  %tmp25 = load <4 x float>, ptr addrspace(9) null
   %tmp26 = extractelement <4 x float> %tmp25, i32 2
   br label %LOOP
 

diff --git a/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll b/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
index 2dddba8bccc76..1a31ff1a894a2 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -10,12 +10,12 @@
 ; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 ; SI: buffer_store_dword [[REG0]]
 ; SI: buffer_store_dword [[REG1]]
-define amdgpu_kernel void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
-  %load0 = load i32, i32 addrspace(1)* %ptr, align 4
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 2
-  %load1 = load i32, i32 addrspace(1)* %gep, align 4
-  store i32 %load0, i32 addrspace(1)* %out0, align 4
-  store i32 %load1, i32 addrspace(1)* %out1, align 4
+define amdgpu_kernel void @cluster_global_arg_loads(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %ptr) #0 {
+  %load0 = load i32, ptr addrspace(1) %ptr, align 4
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 2
+  %load1 = load i32, ptr addrspace(1) %gep, align 4
+  store i32 %load0, ptr addrspace(1) %out0, align 4
+  store i32 %load1, ptr addrspace(1) %out1, align 4
   ret void
 }
 
@@ -24,13 +24,13 @@ define amdgpu_kernel void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32
 ; FUNC-LABEL: {{^}}same_base_ptr_crash:
 ; SI: buffer_load_dword
 ; SI: buffer_load_dword
-define amdgpu_kernel void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
+define amdgpu_kernel void @same_base_ptr_crash(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %offset) {
 entry:
-  %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset
-  %tmp0 = load i32, i32 addrspace(1)* %out
-  %tmp1 = load i32, i32 addrspace(1)* %out1
+  %out1 = getelementptr i32, ptr addrspace(1) %out, i32 %offset
+  %tmp0 = load i32, ptr addrspace(1) %out
+  %tmp1 = load i32, ptr addrspace(1) %out1
   %tmp2 = add i32 %tmp0, %tmp1
-  store i32 %tmp2, i32 addrspace(1)* %out
+  store i32 %tmp2, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll b/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll
index 832b3b3a29203..1888df188ac07 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll
@@ -3,10 +3,10 @@
 
 define amdgpu_kernel void @main() {
 main_body:
-  %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %0 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 2)
   %1 = extractelement <4 x float> %0, i32 0
   %2 = fadd float 1.000000e+03, %1
-  %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %3 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %4 = extractelement <4 x float> %3, i32 0
   %5 = bitcast float %4 to i32
   %6 = icmp eq i32 %5, 0
@@ -47,7 +47,7 @@ IF:                                               ; preds = %main_body
   br label %ENDIF
 
 ELSE:                                             ; preds = %main_body
-  %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %36 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %37 = extractelement <4 x float> %36, i32 0
   %38 = bitcast float %37 to i32
   %39 = icmp eq i32 %38, 1
@@ -80,7 +80,7 @@ IF23:                                             ; preds = %ELSE
   %.28 = select i1 %54, float 0x36A0000000000000, float 0.000000e+00
   %55 = bitcast float %.28 to i32
   %56 = sitofp i32 %55 to float
-  %57 = load <4 x float>, <4 x float> addrspace(8)* null
+  %57 = load <4 x float>, ptr addrspace(8) null
   %58 = extractelement <4 x float> %57, i32 0
   %59 = fsub float -0.000000e+00, %58
   %60 = fadd float %2, %59

diff --git a/llvm/test/CodeGen/AMDGPU/schedule-if.ll b/llvm/test/CodeGen/AMDGPU/schedule-if.ll
index feac5d918f63b..21c5d1ad8ec81 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-if.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-if.ll
@@ -3,7 +3,7 @@
 
 define amdgpu_kernel void @main() {
 main_body:
-  %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %0 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %1 = extractelement <4 x float> %0, i32 0
   %2 = bitcast float %1 to i32
   %3 = icmp eq i32 %2, 0
@@ -14,7 +14,7 @@ main_body:
   br i1 %7, label %ENDIF, label %ELSE
 
 ELSE:                                             ; preds = %main_body
-  %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %8 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %9 = extractelement <4 x float> %8, i32 0
   %10 = bitcast float %9 to i32
   %11 = icmp eq i32 %10, 1
@@ -36,7 +36,7 @@ ENDIF:                                            ; preds = %IF13, %ELSE, %main_
   ret void
 
 IF13:                                             ; preds = %ELSE
-  %20 = load <4 x float>, <4 x float> addrspace(8)* null
+  %20 = load <4 x float>, ptr addrspace(8) null
   %21 = extractelement <4 x float> %20, i32 0
   %22 = fsub float -0.000000e+00, %21
   %23 = fadd float 1.000000e+03, %22

diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
index 52fa12346c1a2..d0812eeaf772b 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
@@ -4,583 +4,583 @@
 
 ; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}
 
-define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) #0 {
+define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) #0 {
 bb:
-  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
-  %tmp2 = load float, float addrspace(3)* %tmp, align 4
-  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
-  %tmp4 = load float, float addrspace(3)* %tmp3, align 4
-  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 3
-  %tmp6 = load float, float addrspace(3)* %tmp5, align 4
+  %tmp = getelementptr inbounds float, ptr addrspace(3) %arg, i32 1
+  %tmp2 = load float, ptr addrspace(3) %tmp, align 4
+  %tmp3 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 2
+  %tmp4 = load float, ptr addrspace(3) %tmp3, align 4
+  %tmp5 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 3
+  %tmp6 = load float, ptr addrspace(3) %tmp5, align 4
   %tmp7 = tail call float @llvm.fmuladd.f32(float %tmp2, float %tmp4, float %tmp6)
-  %tmp8 = getelementptr inbounds float, float addrspace(3)* %arg, i32 5
-  %tmp9 = load float, float addrspace(3)* %tmp8, align 4
-  %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6
-  %tmp11 = load float, float addrspace(3)* %tmp10, align 4
-  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 7
-  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
+  %tmp8 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 5
+  %tmp9 = load float, ptr addrspace(3) %tmp8, align 4
+  %tmp10 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 6
+  %tmp11 = load float, ptr addrspace(3) %tmp10, align 4
+  %tmp12 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 7
+  %tmp13 = load float, ptr addrspace(3) %tmp12, align 4
   %tmp14 = tail call float @llvm.fmuladd.f32(float %tmp9, float %tmp11, float %tmp13)
-  %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 9
-  %tmp16 = load float, float addrspace(3)* %tmp15, align 4
-  %tmp17 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10
-  %tmp18 = load float, float addrspace(3)* %tmp17, align 4
-  %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 11
-  %tmp20 = load float, float addrspace(3)* %tmp19, align 4
+  %tmp15 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 9
+  %tmp16 = load float, ptr addrspace(3) %tmp15, align 4
+  %tmp17 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 10
+  %tmp18 = load float, ptr addrspace(3) %tmp17, align 4
+  %tmp19 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 11
+  %tmp20 = load float, ptr addrspace(3) %tmp19, align 4
   %tmp21 = tail call float @llvm.fmuladd.f32(float %tmp16, float %tmp18, float %tmp20)
-  %tmp22 = getelementptr inbounds float, float addrspace(3)* %arg, i32 13
-  %tmp23 = load float, float addrspace(3)* %tmp22, align 4
-  %tmp24 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14
-  %tmp25 = load float, float addrspace(3)* %tmp24, align 4
-  %tmp26 = getelementptr inbounds float, float addrspace(3)* %arg, i32 15
-  %tmp27 = load float, float addrspace(3)* %tmp26, align 4
+  %tmp22 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 13
+  %tmp23 = load float, ptr addrspace(3) %tmp22, align 4
+  %tmp24 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 14
+  %tmp25 = load float, ptr addrspace(3) %tmp24, align 4
+  %tmp26 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 15
+  %tmp27 = load float, ptr addrspace(3) %tmp26, align 4
   %tmp28 = tail call float @llvm.fmuladd.f32(float %tmp23, float %tmp25, float %tmp27)
-  %tmp29 = getelementptr inbounds float, float addrspace(3)* %arg, i32 17
-  %tmp30 = load float, float addrspace(3)* %tmp29, align 4
-  %tmp31 = getelementptr inbounds float, float addrspace(3)* %arg, i32 18
-  %tmp32 = load float, float addrspace(3)* %tmp31, align 4
-  %tmp33 = getelementptr inbounds float, float addrspace(3)* %arg, i32 19
-  %tmp34 = load float, float addrspace(3)* %tmp33, align 4
+  %tmp29 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 17
+  %tmp30 = load float, ptr addrspace(3) %tmp29, align 4
+  %tmp31 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 18
+  %tmp32 = load float, ptr addrspace(3) %tmp31, align 4
+  %tmp33 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 19
+  %tmp34 = load float, ptr addrspace(3) %tmp33, align 4
   %tmp35 = tail call float @llvm.fmuladd.f32(float %tmp30, float %tmp32, float %tmp34)
-  %tmp36 = getelementptr inbounds float, float addrspace(3)* %arg, i32 21
-  %tmp37 = load float, float addrspace(3)* %tmp36, align 4
-  %tmp38 = getelementptr inbounds float, float addrspace(3)* %arg, i32 22
-  %tmp39 = load float, float addrspace(3)* %tmp38, align 4
-  %tmp40 = getelementptr inbounds float, float addrspace(3)* %arg, i32 23
-  %tmp41 = load float, float addrspace(3)* %tmp40, align 4
+  %tmp36 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 21
+  %tmp37 = load float, ptr addrspace(3) %tmp36, align 4
+  %tmp38 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 22
+  %tmp39 = load float, ptr addrspace(3) %tmp38, align 4
+  %tmp40 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 23
+  %tmp41 = load float, ptr addrspace(3) %tmp40, align 4
   %tmp42 = tail call float @llvm.fmuladd.f32(float %tmp37, float %tmp39, float %tmp41)
-  %tmp43 = getelementptr inbounds float, float addrspace(3)* %arg, i32 25
-  %tmp44 = load float, float addrspace(3)* %tmp43, align 4
-  %tmp45 = getelementptr inbounds float, float addrspace(3)* %arg, i32 26
-  %tmp46 = load float, float addrspace(3)* %tmp45, align 4
-  %tmp47 = getelementptr inbounds float, float addrspace(3)* %arg, i32 27
-  %tmp48 = load float, float addrspace(3)* %tmp47, align 4
+  %tmp43 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 25
+  %tmp44 = load float, ptr addrspace(3) %tmp43, align 4
+  %tmp45 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 26
+  %tmp46 = load float, ptr addrspace(3) %tmp45, align 4
+  %tmp47 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 27
+  %tmp48 = load float, ptr addrspace(3) %tmp47, align 4
   %tmp49 = tail call float @llvm.fmuladd.f32(float %tmp44, float %tmp46, float %tmp48)
-  %tmp50 = getelementptr inbounds float, float addrspace(3)* %arg, i32 29
-  %tmp51 = load float, float addrspace(3)* %tmp50, align 4
-  %tmp52 = getelementptr inbounds float, float addrspace(3)* %arg, i32 30
-  %tmp53 = load float, float addrspace(3)* %tmp52, align 4
-  %tmp54 = getelementptr inbounds float, float addrspace(3)* %arg, i32 31
-  %tmp55 = load float, float addrspace(3)* %tmp54, align 4
+  %tmp50 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 29
+  %tmp51 = load float, ptr addrspace(3) %tmp50, align 4
+  %tmp52 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 30
+  %tmp53 = load float, ptr addrspace(3) %tmp52, align 4
+  %tmp54 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 31
+  %tmp55 = load float, ptr addrspace(3) %tmp54, align 4
   %tmp56 = tail call float @llvm.fmuladd.f32(float %tmp51, float %tmp53, float %tmp55)
-  %tmp57 = getelementptr inbounds float, float addrspace(3)* %arg, i32 33
-  %tmp58 = load float, float addrspace(3)* %tmp57, align 4
-  %tmp59 = getelementptr inbounds float, float addrspace(3)* %arg, i32 34
-  %tmp60 = load float, float addrspace(3)* %tmp59, align 4
-  %tmp61 = getelementptr inbounds float, float addrspace(3)* %arg, i32 35
-  %tmp62 = load float, float addrspace(3)* %tmp61, align 4
+  %tmp57 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 33
+  %tmp58 = load float, ptr addrspace(3) %tmp57, align 4
+  %tmp59 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 34
+  %tmp60 = load float, ptr addrspace(3) %tmp59, align 4
+  %tmp61 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 35
+  %tmp62 = load float, ptr addrspace(3) %tmp61, align 4
   %tmp63 = tail call float @llvm.fmuladd.f32(float %tmp58, float %tmp60, float %tmp62)
-  %tmp64 = getelementptr inbounds float, float addrspace(3)* %arg, i32 37
-  %tmp65 = load float, float addrspace(3)* %tmp64, align 4
-  %tmp66 = getelementptr inbounds float, float addrspace(3)* %arg, i32 38
-  %tmp67 = load float, float addrspace(3)* %tmp66, align 4
-  %tmp68 = getelementptr inbounds float, float addrspace(3)* %arg, i32 39
-  %tmp69 = load float, float addrspace(3)* %tmp68, align 4
+  %tmp64 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 37
+  %tmp65 = load float, ptr addrspace(3) %tmp64, align 4
+  %tmp66 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 38
+  %tmp67 = load float, ptr addrspace(3) %tmp66, align 4
+  %tmp68 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 39
+  %tmp69 = load float, ptr addrspace(3) %tmp68, align 4
   %tmp70 = tail call float @llvm.fmuladd.f32(float %tmp65, float %tmp67, float %tmp69)
-  %tmp71 = getelementptr inbounds float, float addrspace(3)* %arg, i32 41
-  %tmp72 = load float, float addrspace(3)* %tmp71, align 4
-  %tmp73 = getelementptr inbounds float, float addrspace(3)* %arg, i32 42
-  %tmp74 = load float, float addrspace(3)* %tmp73, align 4
-  %tmp75 = getelementptr inbounds float, float addrspace(3)* %arg, i32 43
-  %tmp76 = load float, float addrspace(3)* %tmp75, align 4
+  %tmp71 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 41
+  %tmp72 = load float, ptr addrspace(3) %tmp71, align 4
+  %tmp73 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 42
+  %tmp74 = load float, ptr addrspace(3) %tmp73, align 4
+  %tmp75 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 43
+  %tmp76 = load float, ptr addrspace(3) %tmp75, align 4
   %tmp77 = tail call float @llvm.fmuladd.f32(float %tmp72, float %tmp74, float %tmp76)
-  %tmp78 = getelementptr inbounds float, float addrspace(3)* %arg, i32 45
-  %tmp79 = load float, float addrspace(3)* %tmp78, align 4
-  %tmp80 = getelementptr inbounds float, float addrspace(3)* %arg, i32 46
-  %tmp81 = load float, float addrspace(3)* %tmp80, align 4
-  %tmp82 = getelementptr inbounds float, float addrspace(3)* %arg, i32 47
-  %tmp83 = load float, float addrspace(3)* %tmp82, align 4
+  %tmp78 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 45
+  %tmp79 = load float, ptr addrspace(3) %tmp78, align 4
+  %tmp80 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 46
+  %tmp81 = load float, ptr addrspace(3) %tmp80, align 4
+  %tmp82 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 47
+  %tmp83 = load float, ptr addrspace(3) %tmp82, align 4
   %tmp84 = tail call float @llvm.fmuladd.f32(float %tmp79, float %tmp81, float %tmp83)
-  %tmp85 = getelementptr inbounds float, float addrspace(3)* %arg, i32 49
-  %tmp86 = load float, float addrspace(3)* %tmp85, align 4
-  %tmp87 = getelementptr inbounds float, float addrspace(3)* %arg, i32 50
-  %tmp88 = load float, float addrspace(3)* %tmp87, align 4
-  %tmp89 = getelementptr inbounds float, float addrspace(3)* %arg, i32 51
-  %tmp90 = load float, float addrspace(3)* %tmp89, align 4
+  %tmp85 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 49
+  %tmp86 = load float, ptr addrspace(3) %tmp85, align 4
+  %tmp87 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 50
+  %tmp88 = load float, ptr addrspace(3) %tmp87, align 4
+  %tmp89 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 51
+  %tmp90 = load float, ptr addrspace(3) %tmp89, align 4
   %tmp91 = tail call float @llvm.fmuladd.f32(float %tmp86, float %tmp88, float %tmp90)
-  %tmp92 = getelementptr inbounds float, float addrspace(3)* %arg, i32 53
-  %tmp93 = load float, float addrspace(3)* %tmp92, align 4
-  %tmp94 = getelementptr inbounds float, float addrspace(3)* %arg, i32 54
-  %tmp95 = load float, float addrspace(3)* %tmp94, align 4
-  %tmp96 = getelementptr inbounds float, float addrspace(3)* %arg, i32 55
-  %tmp97 = load float, float addrspace(3)* %tmp96, align 4
+  %tmp92 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 53
+  %tmp93 = load float, ptr addrspace(3) %tmp92, align 4
+  %tmp94 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 54
+  %tmp95 = load float, ptr addrspace(3) %tmp94, align 4
+  %tmp96 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 55
+  %tmp97 = load float, ptr addrspace(3) %tmp96, align 4
   %tmp98 = tail call float @llvm.fmuladd.f32(float %tmp93, float %tmp95, float %tmp97)
-  %tmp99 = getelementptr inbounds float, float addrspace(3)* %arg, i32 57
-  %tmp100 = load float, float addrspace(3)* %tmp99, align 4
-  %tmp101 = getelementptr inbounds float, float addrspace(3)* %arg, i32 58
-  %tmp102 = load float, float addrspace(3)* %tmp101, align 4
-  %tmp103 = getelementptr inbounds float, float addrspace(3)* %arg, i32 59
-  %tmp104 = load float, float addrspace(3)* %tmp103, align 4
+  %tmp99 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 57
+  %tmp100 = load float, ptr addrspace(3) %tmp99, align 4
+  %tmp101 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 58
+  %tmp102 = load float, ptr addrspace(3) %tmp101, align 4
+  %tmp103 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 59
+  %tmp104 = load float, ptr addrspace(3) %tmp103, align 4
   %tmp105 = tail call float @llvm.fmuladd.f32(float %tmp100, float %tmp102, float %tmp104)
-  %tmp106 = getelementptr inbounds float, float addrspace(3)* %arg, i32 61
-  %tmp107 = load float, float addrspace(3)* %tmp106, align 4
-  %tmp108 = getelementptr inbounds float, float addrspace(3)* %arg, i32 62
-  %tmp109 = load float, float addrspace(3)* %tmp108, align 4
-  %tmp110 = getelementptr inbounds float, float addrspace(3)* %arg, i32 63
-  %tmp111 = load float, float addrspace(3)* %tmp110, align 4
+  %tmp106 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 61
+  %tmp107 = load float, ptr addrspace(3) %tmp106, align 4
+  %tmp108 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 62
+  %tmp109 = load float, ptr addrspace(3) %tmp108, align 4
+  %tmp110 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 63
+  %tmp111 = load float, ptr addrspace(3) %tmp110, align 4
   %tmp112 = tail call float @llvm.fmuladd.f32(float %tmp107, float %tmp109, float %tmp111)
-  %tmp113 = getelementptr inbounds float, float addrspace(3)* %arg, i32 65
-  %tmp114 = load float, float addrspace(3)* %tmp113, align 4
-  %tmp115 = getelementptr inbounds float, float addrspace(3)* %arg, i32 66
-  %tmp116 = load float, float addrspace(3)* %tmp115, align 4
-  %tmp117 = getelementptr inbounds float, float addrspace(3)* %arg, i32 67
-  %tmp118 = load float, float addrspace(3)* %tmp117, align 4
+  %tmp113 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 65
+  %tmp114 = load float, ptr addrspace(3) %tmp113, align 4
+  %tmp115 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 66
+  %tmp116 = load float, ptr addrspace(3) %tmp115, align 4
+  %tmp117 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 67
+  %tmp118 = load float, ptr addrspace(3) %tmp117, align 4
   %tmp119 = tail call float @llvm.fmuladd.f32(float %tmp114, float %tmp116, float %tmp118)
-  %tmp120 = getelementptr inbounds float, float addrspace(3)* %arg, i32 69
-  %tmp121 = load float, float addrspace(3)* %tmp120, align 4
-  %tmp122 = getelementptr inbounds float, float addrspace(3)* %arg, i32 70
-  %tmp123 = load float, float addrspace(3)* %tmp122, align 4
-  %tmp124 = getelementptr inbounds float, float addrspace(3)* %arg, i32 71
-  %tmp125 = load float, float addrspace(3)* %tmp124, align 4
+  %tmp120 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 69
+  %tmp121 = load float, ptr addrspace(3) %tmp120, align 4
+  %tmp122 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 70
+  %tmp123 = load float, ptr addrspace(3) %tmp122, align 4
+  %tmp124 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 71
+  %tmp125 = load float, ptr addrspace(3) %tmp124, align 4
   %tmp126 = tail call float @llvm.fmuladd.f32(float %tmp121, float %tmp123, float %tmp125)
-  %tmp127 = getelementptr inbounds float, float addrspace(3)* %arg, i32 73
-  %tmp128 = load float, float addrspace(3)* %tmp127, align 4
-  %tmp129 = getelementptr inbounds float, float addrspace(3)* %arg, i32 74
-  %tmp130 = load float, float addrspace(3)* %tmp129, align 4
-  %tmp131 = getelementptr inbounds float, float addrspace(3)* %arg, i32 75
-  %tmp132 = load float, float addrspace(3)* %tmp131, align 4
+  %tmp127 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 73
+  %tmp128 = load float, ptr addrspace(3) %tmp127, align 4
+  %tmp129 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 74
+  %tmp130 = load float, ptr addrspace(3) %tmp129, align 4
+  %tmp131 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 75
+  %tmp132 = load float, ptr addrspace(3) %tmp131, align 4
   %tmp133 = tail call float @llvm.fmuladd.f32(float %tmp128, float %tmp130, float %tmp132)
-  %tmp134 = getelementptr inbounds float, float addrspace(3)* %arg, i32 77
-  %tmp135 = load float, float addrspace(3)* %tmp134, align 4
-  %tmp136 = getelementptr inbounds float, float addrspace(3)* %arg, i32 78
-  %tmp137 = load float, float addrspace(3)* %tmp136, align 4
-  %tmp138 = getelementptr inbounds float, float addrspace(3)* %arg, i32 79
-  %tmp139 = load float, float addrspace(3)* %tmp138, align 4
+  %tmp134 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 77
+  %tmp135 = load float, ptr addrspace(3) %tmp134, align 4
+  %tmp136 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 78
+  %tmp137 = load float, ptr addrspace(3) %tmp136, align 4
+  %tmp138 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 79
+  %tmp139 = load float, ptr addrspace(3) %tmp138, align 4
   %tmp140 = tail call float @llvm.fmuladd.f32(float %tmp135, float %tmp137, float %tmp139)
-  %tmp141 = getelementptr inbounds float, float addrspace(3)* %arg, i32 81
-  %tmp142 = load float, float addrspace(3)* %tmp141, align 4
-  %tmp143 = getelementptr inbounds float, float addrspace(3)* %arg, i32 82
-  %tmp144 = load float, float addrspace(3)* %tmp143, align 4
-  %tmp145 = getelementptr inbounds float, float addrspace(3)* %arg, i32 83
-  %tmp146 = load float, float addrspace(3)* %tmp145, align 4
+  %tmp141 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 81
+  %tmp142 = load float, ptr addrspace(3) %tmp141, align 4
+  %tmp143 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 82
+  %tmp144 = load float, ptr addrspace(3) %tmp143, align 4
+  %tmp145 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 83
+  %tmp146 = load float, ptr addrspace(3) %tmp145, align 4
   %tmp147 = tail call float @llvm.fmuladd.f32(float %tmp142, float %tmp144, float %tmp146)
-  %tmp148 = getelementptr inbounds float, float addrspace(3)* %arg, i32 85
-  %tmp149 = load float, float addrspace(3)* %tmp148, align 4
-  %tmp150 = getelementptr inbounds float, float addrspace(3)* %arg, i32 86
-  %tmp151 = load float, float addrspace(3)* %tmp150, align 4
-  %tmp152 = getelementptr inbounds float, float addrspace(3)* %arg, i32 87
-  %tmp153 = load float, float addrspace(3)* %tmp152, align 4
+  %tmp148 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 85
+  %tmp149 = load float, ptr addrspace(3) %tmp148, align 4
+  %tmp150 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 86
+  %tmp151 = load float, ptr addrspace(3) %tmp150, align 4
+  %tmp152 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 87
+  %tmp153 = load float, ptr addrspace(3) %tmp152, align 4
   %tmp154 = tail call float @llvm.fmuladd.f32(float %tmp149, float %tmp151, float %tmp153)
-  %tmp155 = getelementptr inbounds float, float addrspace(3)* %arg, i32 89
-  %tmp156 = load float, float addrspace(3)* %tmp155, align 4
-  %tmp157 = getelementptr inbounds float, float addrspace(3)* %arg, i32 90
-  %tmp158 = load float, float addrspace(3)* %tmp157, align 4
-  %tmp159 = getelementptr inbounds float, float addrspace(3)* %arg, i32 91
-  %tmp160 = load float, float addrspace(3)* %tmp159, align 4
+  %tmp155 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 89
+  %tmp156 = load float, ptr addrspace(3) %tmp155, align 4
+  %tmp157 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 90
+  %tmp158 = load float, ptr addrspace(3) %tmp157, align 4
+  %tmp159 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 91
+  %tmp160 = load float, ptr addrspace(3) %tmp159, align 4
   %tmp161 = tail call float @llvm.fmuladd.f32(float %tmp156, float %tmp158, float %tmp160)
-  %tmp162 = getelementptr inbounds float, float addrspace(3)* %arg, i32 93
-  %tmp163 = load float, float addrspace(3)* %tmp162, align 4
-  %tmp164 = getelementptr inbounds float, float addrspace(3)* %arg, i32 94
-  %tmp165 = load float, float addrspace(3)* %tmp164, align 4
-  %tmp166 = getelementptr inbounds float, float addrspace(3)* %arg, i32 95
-  %tmp167 = load float, float addrspace(3)* %tmp166, align 4
+  %tmp162 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 93
+  %tmp163 = load float, ptr addrspace(3) %tmp162, align 4
+  %tmp164 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 94
+  %tmp165 = load float, ptr addrspace(3) %tmp164, align 4
+  %tmp166 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 95
+  %tmp167 = load float, ptr addrspace(3) %tmp166, align 4
   %tmp168 = tail call float @llvm.fmuladd.f32(float %tmp163, float %tmp165, float %tmp167)
-  %tmp169 = getelementptr inbounds float, float addrspace(3)* %arg, i32 97
-  %tmp170 = load float, float addrspace(3)* %tmp169, align 4
-  %tmp171 = getelementptr inbounds float, float addrspace(3)* %arg, i32 98
-  %tmp172 = load float, float addrspace(3)* %tmp171, align 4
-  %tmp173 = getelementptr inbounds float, float addrspace(3)* %arg, i32 99
-  %tmp174 = load float, float addrspace(3)* %tmp173, align 4
+  %tmp169 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 97
+  %tmp170 = load float, ptr addrspace(3) %tmp169, align 4
+  %tmp171 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 98
+  %tmp172 = load float, ptr addrspace(3) %tmp171, align 4
+  %tmp173 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 99
+  %tmp174 = load float, ptr addrspace(3) %tmp173, align 4
   %tmp175 = tail call float @llvm.fmuladd.f32(float %tmp170, float %tmp172, float %tmp174)
-  %tmp176 = getelementptr inbounds float, float addrspace(3)* %arg, i32 101
-  %tmp177 = load float, float addrspace(3)* %tmp176, align 4
-  %tmp178 = getelementptr inbounds float, float addrspace(3)* %arg, i32 102
-  %tmp179 = load float, float addrspace(3)* %tmp178, align 4
-  %tmp180 = getelementptr inbounds float, float addrspace(3)* %arg, i32 103
-  %tmp181 = load float, float addrspace(3)* %tmp180, align 4
+  %tmp176 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 101
+  %tmp177 = load float, ptr addrspace(3) %tmp176, align 4
+  %tmp178 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 102
+  %tmp179 = load float, ptr addrspace(3) %tmp178, align 4
+  %tmp180 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 103
+  %tmp181 = load float, ptr addrspace(3) %tmp180, align 4
   %tmp182 = tail call float @llvm.fmuladd.f32(float %tmp177, float %tmp179, float %tmp181)
-  %tmp183 = getelementptr inbounds float, float addrspace(3)* %arg, i32 105
-  %tmp184 = load float, float addrspace(3)* %tmp183, align 4
-  %tmp185 = getelementptr inbounds float, float addrspace(3)* %arg, i32 106
-  %tmp186 = load float, float addrspace(3)* %tmp185, align 4
-  %tmp187 = getelementptr inbounds float, float addrspace(3)* %arg, i32 107
-  %tmp188 = load float, float addrspace(3)* %tmp187, align 4
+  %tmp183 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 105
+  %tmp184 = load float, ptr addrspace(3) %tmp183, align 4
+  %tmp185 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 106
+  %tmp186 = load float, ptr addrspace(3) %tmp185, align 4
+  %tmp187 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 107
+  %tmp188 = load float, ptr addrspace(3) %tmp187, align 4
   %tmp189 = tail call float @llvm.fmuladd.f32(float %tmp184, float %tmp186, float %tmp188)
-  %tmp190 = getelementptr inbounds float, float addrspace(3)* %arg, i32 109
-  %tmp191 = load float, float addrspace(3)* %tmp190, align 4
-  %tmp192 = getelementptr inbounds float, float addrspace(3)* %arg, i32 110
-  %tmp193 = load float, float addrspace(3)* %tmp192, align 4
-  %tmp194 = getelementptr inbounds float, float addrspace(3)* %arg, i32 111
-  %tmp195 = load float, float addrspace(3)* %tmp194, align 4
+  %tmp190 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 109
+  %tmp191 = load float, ptr addrspace(3) %tmp190, align 4
+  %tmp192 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 110
+  %tmp193 = load float, ptr addrspace(3) %tmp192, align 4
+  %tmp194 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 111
+  %tmp195 = load float, ptr addrspace(3) %tmp194, align 4
   %tmp196 = tail call float @llvm.fmuladd.f32(float %tmp191, float %tmp193, float %tmp195)
-  %tmp197 = getelementptr inbounds float, float addrspace(3)* %arg, i32 113
-  %tmp198 = load float, float addrspace(3)* %tmp197, align 4
-  %tmp199 = getelementptr inbounds float, float addrspace(3)* %arg, i32 114
-  %tmp200 = load float, float addrspace(3)* %tmp199, align 4
-  %tmp201 = getelementptr inbounds float, float addrspace(3)* %arg, i32 115
-  %tmp202 = load float, float addrspace(3)* %tmp201, align 4
+  %tmp197 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 113
+  %tmp198 = load float, ptr addrspace(3) %tmp197, align 4
+  %tmp199 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 114
+  %tmp200 = load float, ptr addrspace(3) %tmp199, align 4
+  %tmp201 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 115
+  %tmp202 = load float, ptr addrspace(3) %tmp201, align 4
   %tmp203 = tail call float @llvm.fmuladd.f32(float %tmp198, float %tmp200, float %tmp202)
-  %tmp204 = getelementptr inbounds float, float addrspace(3)* %arg, i32 117
-  %tmp205 = load float, float addrspace(3)* %tmp204, align 4
-  %tmp206 = getelementptr inbounds float, float addrspace(3)* %arg, i32 118
-  %tmp207 = load float, float addrspace(3)* %tmp206, align 4
-  %tmp208 = getelementptr inbounds float, float addrspace(3)* %arg, i32 119
-  %tmp209 = load float, float addrspace(3)* %tmp208, align 4
+  %tmp204 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 117
+  %tmp205 = load float, ptr addrspace(3) %tmp204, align 4
+  %tmp206 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 118
+  %tmp207 = load float, ptr addrspace(3) %tmp206, align 4
+  %tmp208 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 119
+  %tmp209 = load float, ptr addrspace(3) %tmp208, align 4
   %tmp210 = tail call float @llvm.fmuladd.f32(float %tmp205, float %tmp207, float %tmp209)
-  %tmp211 = getelementptr inbounds float, float addrspace(3)* %arg, i32 121
-  %tmp212 = load float, float addrspace(3)* %tmp211, align 4
-  %tmp213 = getelementptr inbounds float, float addrspace(3)* %arg, i32 122
-  %tmp214 = load float, float addrspace(3)* %tmp213, align 4
-  %tmp215 = getelementptr inbounds float, float addrspace(3)* %arg, i32 123
-  %tmp216 = load float, float addrspace(3)* %tmp215, align 4
+  %tmp211 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 121
+  %tmp212 = load float, ptr addrspace(3) %tmp211, align 4
+  %tmp213 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 122
+  %tmp214 = load float, ptr addrspace(3) %tmp213, align 4
+  %tmp215 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 123
+  %tmp216 = load float, ptr addrspace(3) %tmp215, align 4
   %tmp217 = tail call float @llvm.fmuladd.f32(float %tmp212, float %tmp214, float %tmp216)
-  %tmp218 = getelementptr inbounds float, float addrspace(3)* %arg, i32 125
-  %tmp219 = load float, float addrspace(3)* %tmp218, align 4
-  %tmp220 = getelementptr inbounds float, float addrspace(3)* %arg, i32 126
-  %tmp221 = load float, float addrspace(3)* %tmp220, align 4
-  %tmp222 = getelementptr inbounds float, float addrspace(3)* %arg, i32 127
-  %tmp223 = load float, float addrspace(3)* %tmp222, align 4
+  %tmp218 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 125
+  %tmp219 = load float, ptr addrspace(3) %tmp218, align 4
+  %tmp220 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 126
+  %tmp221 = load float, ptr addrspace(3) %tmp220, align 4
+  %tmp222 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 127
+  %tmp223 = load float, ptr addrspace(3) %tmp222, align 4
   %tmp224 = tail call float @llvm.fmuladd.f32(float %tmp219, float %tmp221, float %tmp223)
-  %tmp225 = getelementptr inbounds float, float addrspace(3)* %arg, i32 129
-  %tmp226 = load float, float addrspace(3)* %tmp225, align 4
-  %tmp227 = getelementptr inbounds float, float addrspace(3)* %arg, i32 130
-  %tmp228 = load float, float addrspace(3)* %tmp227, align 4
-  %tmp229 = getelementptr inbounds float, float addrspace(3)* %arg, i32 131
-  %tmp230 = load float, float addrspace(3)* %tmp229, align 4
+  %tmp225 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 129
+  %tmp226 = load float, ptr addrspace(3) %tmp225, align 4
+  %tmp227 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 130
+  %tmp228 = load float, ptr addrspace(3) %tmp227, align 4
+  %tmp229 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 131
+  %tmp230 = load float, ptr addrspace(3) %tmp229, align 4
   %tmp231 = tail call float @llvm.fmuladd.f32(float %tmp226, float %tmp228, float %tmp230)
-  %tmp232 = getelementptr inbounds float, float addrspace(3)* %arg, i32 133
-  %tmp233 = load float, float addrspace(3)* %tmp232, align 4
-  %tmp234 = getelementptr inbounds float, float addrspace(3)* %arg, i32 134
-  %tmp235 = load float, float addrspace(3)* %tmp234, align 4
-  %tmp236 = getelementptr inbounds float, float addrspace(3)* %arg, i32 135
-  %tmp237 = load float, float addrspace(3)* %tmp236, align 4
+  %tmp232 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 133
+  %tmp233 = load float, ptr addrspace(3) %tmp232, align 4
+  %tmp234 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 134
+  %tmp235 = load float, ptr addrspace(3) %tmp234, align 4
+  %tmp236 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 135
+  %tmp237 = load float, ptr addrspace(3) %tmp236, align 4
   %tmp238 = tail call float @llvm.fmuladd.f32(float %tmp233, float %tmp235, float %tmp237)
-  %tmp239 = getelementptr inbounds float, float addrspace(3)* %arg, i32 137
-  %tmp240 = load float, float addrspace(3)* %tmp239, align 4
-  %tmp241 = getelementptr inbounds float, float addrspace(3)* %arg, i32 138
-  %tmp242 = load float, float addrspace(3)* %tmp241, align 4
-  %tmp243 = getelementptr inbounds float, float addrspace(3)* %arg, i32 139
-  %tmp244 = load float, float addrspace(3)* %tmp243, align 4
+  %tmp239 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 137
+  %tmp240 = load float, ptr addrspace(3) %tmp239, align 4
+  %tmp241 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 138
+  %tmp242 = load float, ptr addrspace(3) %tmp241, align 4
+  %tmp243 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 139
+  %tmp244 = load float, ptr addrspace(3) %tmp243, align 4
   %tmp245 = tail call float @llvm.fmuladd.f32(float %tmp240, float %tmp242, float %tmp244)
-  %tmp246 = getelementptr inbounds float, float addrspace(3)* %arg, i32 141
-  %tmp247 = load float, float addrspace(3)* %tmp246, align 4
-  %tmp248 = getelementptr inbounds float, float addrspace(3)* %arg, i32 142
-  %tmp249 = load float, float addrspace(3)* %tmp248, align 4
-  %tmp250 = getelementptr inbounds float, float addrspace(3)* %arg, i32 143
-  %tmp251 = load float, float addrspace(3)* %tmp250, align 4
+  %tmp246 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 141
+  %tmp247 = load float, ptr addrspace(3) %tmp246, align 4
+  %tmp248 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 142
+  %tmp249 = load float, ptr addrspace(3) %tmp248, align 4
+  %tmp250 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 143
+  %tmp251 = load float, ptr addrspace(3) %tmp250, align 4
   %tmp252 = tail call float @llvm.fmuladd.f32(float %tmp247, float %tmp249, float %tmp251)
-  %tmp253 = getelementptr inbounds float, float addrspace(3)* %arg, i32 145
-  %tmp254 = load float, float addrspace(3)* %tmp253, align 4
-  %tmp255 = getelementptr inbounds float, float addrspace(3)* %arg, i32 146
-  %tmp256 = load float, float addrspace(3)* %tmp255, align 4
-  %tmp257 = getelementptr inbounds float, float addrspace(3)* %arg, i32 147
-  %tmp258 = load float, float addrspace(3)* %tmp257, align 4
+  %tmp253 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 145
+  %tmp254 = load float, ptr addrspace(3) %tmp253, align 4
+  %tmp255 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 146
+  %tmp256 = load float, ptr addrspace(3) %tmp255, align 4
+  %tmp257 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 147
+  %tmp258 = load float, ptr addrspace(3) %tmp257, align 4
   %tmp259 = tail call float @llvm.fmuladd.f32(float %tmp254, float %tmp256, float %tmp258)
-  %tmp260 = getelementptr inbounds float, float addrspace(3)* %arg, i32 149
-  %tmp261 = load float, float addrspace(3)* %tmp260, align 4
-  %tmp262 = getelementptr inbounds float, float addrspace(3)* %arg, i32 150
-  %tmp263 = load float, float addrspace(3)* %tmp262, align 4
-  %tmp264 = getelementptr inbounds float, float addrspace(3)* %arg, i32 151
-  %tmp265 = load float, float addrspace(3)* %tmp264, align 4
+  %tmp260 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 149
+  %tmp261 = load float, ptr addrspace(3) %tmp260, align 4
+  %tmp262 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 150
+  %tmp263 = load float, ptr addrspace(3) %tmp262, align 4
+  %tmp264 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 151
+  %tmp265 = load float, ptr addrspace(3) %tmp264, align 4
   %tmp266 = tail call float @llvm.fmuladd.f32(float %tmp261, float %tmp263, float %tmp265)
-  %tmp267 = getelementptr inbounds float, float addrspace(3)* %arg, i32 153
-  %tmp268 = load float, float addrspace(3)* %tmp267, align 4
-  %tmp269 = getelementptr inbounds float, float addrspace(3)* %arg, i32 154
-  %tmp270 = load float, float addrspace(3)* %tmp269, align 4
-  %tmp271 = getelementptr inbounds float, float addrspace(3)* %arg, i32 155
-  %tmp272 = load float, float addrspace(3)* %tmp271, align 4
+  %tmp267 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 153
+  %tmp268 = load float, ptr addrspace(3) %tmp267, align 4
+  %tmp269 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 154
+  %tmp270 = load float, ptr addrspace(3) %tmp269, align 4
+  %tmp271 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 155
+  %tmp272 = load float, ptr addrspace(3) %tmp271, align 4
   %tmp273 = tail call float @llvm.fmuladd.f32(float %tmp268, float %tmp270, float %tmp272)
-  %tmp274 = getelementptr inbounds float, float addrspace(3)* %arg, i32 157
-  %tmp275 = load float, float addrspace(3)* %tmp274, align 4
-  %tmp276 = getelementptr inbounds float, float addrspace(3)* %arg, i32 158
-  %tmp277 = load float, float addrspace(3)* %tmp276, align 4
-  %tmp278 = getelementptr inbounds float, float addrspace(3)* %arg, i32 159
-  %tmp279 = load float, float addrspace(3)* %tmp278, align 4
+  %tmp274 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 157
+  %tmp275 = load float, ptr addrspace(3) %tmp274, align 4
+  %tmp276 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 158
+  %tmp277 = load float, ptr addrspace(3) %tmp276, align 4
+  %tmp278 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 159
+  %tmp279 = load float, ptr addrspace(3) %tmp278, align 4
   %tmp280 = tail call float @llvm.fmuladd.f32(float %tmp275, float %tmp277, float %tmp279)
-  %tmp281 = getelementptr inbounds float, float addrspace(3)* %arg, i32 161
-  %tmp282 = load float, float addrspace(3)* %tmp281, align 4
-  %tmp283 = getelementptr inbounds float, float addrspace(3)* %arg, i32 162
-  %tmp284 = load float, float addrspace(3)* %tmp283, align 4
-  %tmp285 = getelementptr inbounds float, float addrspace(3)* %arg, i32 163
-  %tmp286 = load float, float addrspace(3)* %tmp285, align 4
+  %tmp281 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 161
+  %tmp282 = load float, ptr addrspace(3) %tmp281, align 4
+  %tmp283 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 162
+  %tmp284 = load float, ptr addrspace(3) %tmp283, align 4
+  %tmp285 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 163
+  %tmp286 = load float, ptr addrspace(3) %tmp285, align 4
   %tmp287 = tail call float @llvm.fmuladd.f32(float %tmp282, float %tmp284, float %tmp286)
-  %tmp288 = getelementptr inbounds float, float addrspace(3)* %arg, i32 165
-  %tmp289 = load float, float addrspace(3)* %tmp288, align 4
-  %tmp290 = getelementptr inbounds float, float addrspace(3)* %arg, i32 166
-  %tmp291 = load float, float addrspace(3)* %tmp290, align 4
-  %tmp292 = getelementptr inbounds float, float addrspace(3)* %arg, i32 167
-  %tmp293 = load float, float addrspace(3)* %tmp292, align 4
+  %tmp288 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 165
+  %tmp289 = load float, ptr addrspace(3) %tmp288, align 4
+  %tmp290 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 166
+  %tmp291 = load float, ptr addrspace(3) %tmp290, align 4
+  %tmp292 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 167
+  %tmp293 = load float, ptr addrspace(3) %tmp292, align 4
   %tmp294 = tail call float @llvm.fmuladd.f32(float %tmp289, float %tmp291, float %tmp293)
-  %tmp295 = getelementptr inbounds float, float addrspace(3)* %arg, i32 169
-  %tmp296 = load float, float addrspace(3)* %tmp295, align 4
-  %tmp297 = getelementptr inbounds float, float addrspace(3)* %arg, i32 170
-  %tmp298 = load float, float addrspace(3)* %tmp297, align 4
-  %tmp299 = getelementptr inbounds float, float addrspace(3)* %arg, i32 171
-  %tmp300 = load float, float addrspace(3)* %tmp299, align 4
+  %tmp295 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 169
+  %tmp296 = load float, ptr addrspace(3) %tmp295, align 4
+  %tmp297 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 170
+  %tmp298 = load float, ptr addrspace(3) %tmp297, align 4
+  %tmp299 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 171
+  %tmp300 = load float, ptr addrspace(3) %tmp299, align 4
   %tmp301 = tail call float @llvm.fmuladd.f32(float %tmp296, float %tmp298, float %tmp300)
-  %tmp302 = getelementptr inbounds float, float addrspace(3)* %arg, i32 173
-  %tmp303 = load float, float addrspace(3)* %tmp302, align 4
-  %tmp304 = getelementptr inbounds float, float addrspace(3)* %arg, i32 174
-  %tmp305 = load float, float addrspace(3)* %tmp304, align 4
-  %tmp306 = getelementptr inbounds float, float addrspace(3)* %arg, i32 175
-  %tmp307 = load float, float addrspace(3)* %tmp306, align 4
+  %tmp302 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 173
+  %tmp303 = load float, ptr addrspace(3) %tmp302, align 4
+  %tmp304 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 174
+  %tmp305 = load float, ptr addrspace(3) %tmp304, align 4
+  %tmp306 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 175
+  %tmp307 = load float, ptr addrspace(3) %tmp306, align 4
   %tmp308 = tail call float @llvm.fmuladd.f32(float %tmp303, float %tmp305, float %tmp307)
-  %tmp309 = getelementptr inbounds float, float addrspace(3)* %arg, i32 177
-  %tmp310 = load float, float addrspace(3)* %tmp309, align 4
-  %tmp311 = getelementptr inbounds float, float addrspace(3)* %arg, i32 178
-  %tmp312 = load float, float addrspace(3)* %tmp311, align 4
-  %tmp313 = getelementptr inbounds float, float addrspace(3)* %arg, i32 179
-  %tmp314 = load float, float addrspace(3)* %tmp313, align 4
+  %tmp309 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 177
+  %tmp310 = load float, ptr addrspace(3) %tmp309, align 4
+  %tmp311 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 178
+  %tmp312 = load float, ptr addrspace(3) %tmp311, align 4
+  %tmp313 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 179
+  %tmp314 = load float, ptr addrspace(3) %tmp313, align 4
   %tmp315 = tail call float @llvm.fmuladd.f32(float %tmp310, float %tmp312, float %tmp314)
-  %tmp316 = getelementptr inbounds float, float addrspace(3)* %arg, i32 181
-  %tmp317 = load float, float addrspace(3)* %tmp316, align 4
-  %tmp318 = getelementptr inbounds float, float addrspace(3)* %arg, i32 182
-  %tmp319 = load float, float addrspace(3)* %tmp318, align 4
-  %tmp320 = getelementptr inbounds float, float addrspace(3)* %arg, i32 183
-  %tmp321 = load float, float addrspace(3)* %tmp320, align 4
+  %tmp316 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 181
+  %tmp317 = load float, ptr addrspace(3) %tmp316, align 4
+  %tmp318 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 182
+  %tmp319 = load float, ptr addrspace(3) %tmp318, align 4
+  %tmp320 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 183
+  %tmp321 = load float, ptr addrspace(3) %tmp320, align 4
   %tmp322 = tail call float @llvm.fmuladd.f32(float %tmp317, float %tmp319, float %tmp321)
-  %tmp323 = getelementptr inbounds float, float addrspace(3)* %arg, i32 185
-  %tmp324 = load float, float addrspace(3)* %tmp323, align 4
-  %tmp325 = getelementptr inbounds float, float addrspace(3)* %arg, i32 186
-  %tmp326 = load float, float addrspace(3)* %tmp325, align 4
-  %tmp327 = getelementptr inbounds float, float addrspace(3)* %arg, i32 187
-  %tmp328 = load float, float addrspace(3)* %tmp327, align 4
+  %tmp323 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 185
+  %tmp324 = load float, ptr addrspace(3) %tmp323, align 4
+  %tmp325 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 186
+  %tmp326 = load float, ptr addrspace(3) %tmp325, align 4
+  %tmp327 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 187
+  %tmp328 = load float, ptr addrspace(3) %tmp327, align 4
   %tmp329 = tail call float @llvm.fmuladd.f32(float %tmp324, float %tmp326, float %tmp328)
-  %tmp330 = getelementptr inbounds float, float addrspace(3)* %arg, i32 189
-  %tmp331 = load float, float addrspace(3)* %tmp330, align 4
-  %tmp332 = getelementptr inbounds float, float addrspace(3)* %arg, i32 190
-  %tmp333 = load float, float addrspace(3)* %tmp332, align 4
-  %tmp334 = getelementptr inbounds float, float addrspace(3)* %arg, i32 191
-  %tmp335 = load float, float addrspace(3)* %tmp334, align 4
+  %tmp330 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 189
+  %tmp331 = load float, ptr addrspace(3) %tmp330, align 4
+  %tmp332 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 190
+  %tmp333 = load float, ptr addrspace(3) %tmp332, align 4
+  %tmp334 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 191
+  %tmp335 = load float, ptr addrspace(3) %tmp334, align 4
   %tmp336 = tail call float @llvm.fmuladd.f32(float %tmp331, float %tmp333, float %tmp335)
-  %tmp337 = getelementptr inbounds float, float addrspace(3)* %arg, i32 193
-  %tmp338 = load float, float addrspace(3)* %tmp337, align 4
-  %tmp339 = getelementptr inbounds float, float addrspace(3)* %arg, i32 194
-  %tmp340 = load float, float addrspace(3)* %tmp339, align 4
-  %tmp341 = getelementptr inbounds float, float addrspace(3)* %arg, i32 195
-  %tmp342 = load float, float addrspace(3)* %tmp341, align 4
+  %tmp337 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 193
+  %tmp338 = load float, ptr addrspace(3) %tmp337, align 4
+  %tmp339 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 194
+  %tmp340 = load float, ptr addrspace(3) %tmp339, align 4
+  %tmp341 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 195
+  %tmp342 = load float, ptr addrspace(3) %tmp341, align 4
   %tmp343 = tail call float @llvm.fmuladd.f32(float %tmp338, float %tmp340, float %tmp342)
-  %tmp344 = getelementptr inbounds float, float addrspace(3)* %arg, i32 197
-  %tmp345 = load float, float addrspace(3)* %tmp344, align 4
-  %tmp346 = getelementptr inbounds float, float addrspace(3)* %arg, i32 198
-  %tmp347 = load float, float addrspace(3)* %tmp346, align 4
-  %tmp348 = getelementptr inbounds float, float addrspace(3)* %arg, i32 199
-  %tmp349 = load float, float addrspace(3)* %tmp348, align 4
+  %tmp344 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 197
+  %tmp345 = load float, ptr addrspace(3) %tmp344, align 4
+  %tmp346 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 198
+  %tmp347 = load float, ptr addrspace(3) %tmp346, align 4
+  %tmp348 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 199
+  %tmp349 = load float, ptr addrspace(3) %tmp348, align 4
   %tmp350 = tail call float @llvm.fmuladd.f32(float %tmp345, float %tmp347, float %tmp349)
-  %tmp351 = getelementptr inbounds float, float addrspace(3)* %arg, i32 201
-  %tmp352 = load float, float addrspace(3)* %tmp351, align 4
-  %tmp353 = getelementptr inbounds float, float addrspace(3)* %arg, i32 202
-  %tmp354 = load float, float addrspace(3)* %tmp353, align 4
-  %tmp355 = getelementptr inbounds float, float addrspace(3)* %arg, i32 203
-  %tmp356 = load float, float addrspace(3)* %tmp355, align 4
+  %tmp351 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 201
+  %tmp352 = load float, ptr addrspace(3) %tmp351, align 4
+  %tmp353 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 202
+  %tmp354 = load float, ptr addrspace(3) %tmp353, align 4
+  %tmp355 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 203
+  %tmp356 = load float, ptr addrspace(3) %tmp355, align 4
   %tmp357 = tail call float @llvm.fmuladd.f32(float %tmp352, float %tmp354, float %tmp356)
-  %tmp358 = getelementptr inbounds float, float addrspace(3)* %arg, i32 205
-  %tmp359 = load float, float addrspace(3)* %tmp358, align 4
-  %tmp360 = getelementptr inbounds float, float addrspace(3)* %arg, i32 206
-  %tmp361 = load float, float addrspace(3)* %tmp360, align 4
-  %tmp362 = getelementptr inbounds float, float addrspace(3)* %arg, i32 207
-  %tmp363 = load float, float addrspace(3)* %tmp362, align 4
+  %tmp358 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 205
+  %tmp359 = load float, ptr addrspace(3) %tmp358, align 4
+  %tmp360 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 206
+  %tmp361 = load float, ptr addrspace(3) %tmp360, align 4
+  %tmp362 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 207
+  %tmp363 = load float, ptr addrspace(3) %tmp362, align 4
   %tmp364 = tail call float @llvm.fmuladd.f32(float %tmp359, float %tmp361, float %tmp363)
-  %tmp365 = getelementptr inbounds float, float addrspace(3)* %arg, i32 209
-  %tmp366 = load float, float addrspace(3)* %tmp365, align 4
-  %tmp367 = getelementptr inbounds float, float addrspace(3)* %arg, i32 210
-  %tmp368 = load float, float addrspace(3)* %tmp367, align 4
-  %tmp369 = getelementptr inbounds float, float addrspace(3)* %arg, i32 211
-  %tmp370 = load float, float addrspace(3)* %tmp369, align 4
+  %tmp365 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 209
+  %tmp366 = load float, ptr addrspace(3) %tmp365, align 4
+  %tmp367 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 210
+  %tmp368 = load float, ptr addrspace(3) %tmp367, align 4
+  %tmp369 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 211
+  %tmp370 = load float, ptr addrspace(3) %tmp369, align 4
   %tmp371 = tail call float @llvm.fmuladd.f32(float %tmp366, float %tmp368, float %tmp370)
-  %tmp372 = getelementptr inbounds float, float addrspace(3)* %arg, i32 213
-  %tmp373 = load float, float addrspace(3)* %tmp372, align 4
-  %tmp374 = getelementptr inbounds float, float addrspace(3)* %arg, i32 214
-  %tmp375 = load float, float addrspace(3)* %tmp374, align 4
-  %tmp376 = getelementptr inbounds float, float addrspace(3)* %arg, i32 215
-  %tmp377 = load float, float addrspace(3)* %tmp376, align 4
+  %tmp372 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 213
+  %tmp373 = load float, ptr addrspace(3) %tmp372, align 4
+  %tmp374 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 214
+  %tmp375 = load float, ptr addrspace(3) %tmp374, align 4
+  %tmp376 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 215
+  %tmp377 = load float, ptr addrspace(3) %tmp376, align 4
   %tmp378 = tail call float @llvm.fmuladd.f32(float %tmp373, float %tmp375, float %tmp377)
-  %tmp379 = getelementptr inbounds float, float addrspace(3)* %arg, i32 217
-  %tmp380 = load float, float addrspace(3)* %tmp379, align 4
-  %tmp381 = getelementptr inbounds float, float addrspace(3)* %arg, i32 218
-  %tmp382 = load float, float addrspace(3)* %tmp381, align 4
-  %tmp383 = getelementptr inbounds float, float addrspace(3)* %arg, i32 219
-  %tmp384 = load float, float addrspace(3)* %tmp383, align 4
+  %tmp379 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 217
+  %tmp380 = load float, ptr addrspace(3) %tmp379, align 4
+  %tmp381 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 218
+  %tmp382 = load float, ptr addrspace(3) %tmp381, align 4
+  %tmp383 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 219
+  %tmp384 = load float, ptr addrspace(3) %tmp383, align 4
   %tmp385 = tail call float @llvm.fmuladd.f32(float %tmp380, float %tmp382, float %tmp384)
-  %tmp386 = getelementptr inbounds float, float addrspace(3)* %arg, i32 221
-  %tmp387 = load float, float addrspace(3)* %tmp386, align 4
-  %tmp388 = getelementptr inbounds float, float addrspace(3)* %arg, i32 222
-  %tmp389 = load float, float addrspace(3)* %tmp388, align 4
-  %tmp390 = getelementptr inbounds float, float addrspace(3)* %arg, i32 223
-  %tmp391 = load float, float addrspace(3)* %tmp390, align 4
+  %tmp386 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 221
+  %tmp387 = load float, ptr addrspace(3) %tmp386, align 4
+  %tmp388 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 222
+  %tmp389 = load float, ptr addrspace(3) %tmp388, align 4
+  %tmp390 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 223
+  %tmp391 = load float, ptr addrspace(3) %tmp390, align 4
   %tmp392 = tail call float @llvm.fmuladd.f32(float %tmp387, float %tmp389, float %tmp391)
-  %tmp393 = getelementptr inbounds float, float addrspace(3)* %arg, i32 225
-  %tmp394 = load float, float addrspace(3)* %tmp393, align 4
-  %tmp395 = getelementptr inbounds float, float addrspace(3)* %arg, i32 226
-  %tmp396 = load float, float addrspace(3)* %tmp395, align 4
-  %tmp397 = getelementptr inbounds float, float addrspace(3)* %arg, i32 227
-  %tmp398 = load float, float addrspace(3)* %tmp397, align 4
+  %tmp393 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 225
+  %tmp394 = load float, ptr addrspace(3) %tmp393, align 4
+  %tmp395 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 226
+  %tmp396 = load float, ptr addrspace(3) %tmp395, align 4
+  %tmp397 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 227
+  %tmp398 = load float, ptr addrspace(3) %tmp397, align 4
   %tmp399 = tail call float @llvm.fmuladd.f32(float %tmp394, float %tmp396, float %tmp398)
-  %tmp400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 229
-  %tmp401 = load float, float addrspace(3)* %tmp400, align 4
-  %tmp402 = getelementptr inbounds float, float addrspace(3)* %arg, i32 230
-  %tmp403 = load float, float addrspace(3)* %tmp402, align 4
-  %tmp404 = getelementptr inbounds float, float addrspace(3)* %arg, i32 231
-  %tmp405 = load float, float addrspace(3)* %tmp404, align 4
+  %tmp400 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 229
+  %tmp401 = load float, ptr addrspace(3) %tmp400, align 4
+  %tmp402 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 230
+  %tmp403 = load float, ptr addrspace(3) %tmp402, align 4
+  %tmp404 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 231
+  %tmp405 = load float, ptr addrspace(3) %tmp404, align 4
   %tmp406 = tail call float @llvm.fmuladd.f32(float %tmp401, float %tmp403, float %tmp405)
-  %tmp407 = getelementptr inbounds float, float addrspace(3)* %arg, i32 233
-  %tmp408 = load float, float addrspace(3)* %tmp407, align 4
-  %tmp409 = getelementptr inbounds float, float addrspace(3)* %arg, i32 234
-  %tmp410 = load float, float addrspace(3)* %tmp409, align 4
-  %tmp411 = getelementptr inbounds float, float addrspace(3)* %arg, i32 235
-  %tmp412 = load float, float addrspace(3)* %tmp411, align 4
+  %tmp407 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 233
+  %tmp408 = load float, ptr addrspace(3) %tmp407, align 4
+  %tmp409 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 234
+  %tmp410 = load float, ptr addrspace(3) %tmp409, align 4
+  %tmp411 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 235
+  %tmp412 = load float, ptr addrspace(3) %tmp411, align 4
   %tmp413 = tail call float @llvm.fmuladd.f32(float %tmp408, float %tmp410, float %tmp412)
-  %tmp414 = getelementptr inbounds float, float addrspace(3)* %arg, i32 237
-  %tmp415 = load float, float addrspace(3)* %tmp414, align 4
-  %tmp416 = getelementptr inbounds float, float addrspace(3)* %arg, i32 238
-  %tmp417 = load float, float addrspace(3)* %tmp416, align 4
-  %tmp418 = getelementptr inbounds float, float addrspace(3)* %arg, i32 239
-  %tmp419 = load float, float addrspace(3)* %tmp418, align 4
+  %tmp414 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 237
+  %tmp415 = load float, ptr addrspace(3) %tmp414, align 4
+  %tmp416 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 238
+  %tmp417 = load float, ptr addrspace(3) %tmp416, align 4
+  %tmp418 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 239
+  %tmp419 = load float, ptr addrspace(3) %tmp418, align 4
   %tmp420 = tail call float @llvm.fmuladd.f32(float %tmp415, float %tmp417, float %tmp419)
-  %tmp421 = getelementptr inbounds float, float addrspace(3)* %arg, i32 241
-  %tmp422 = load float, float addrspace(3)* %tmp421, align 4
-  %tmp423 = getelementptr inbounds float, float addrspace(3)* %arg, i32 242
-  %tmp424 = load float, float addrspace(3)* %tmp423, align 4
-  %tmp425 = getelementptr inbounds float, float addrspace(3)* %arg, i32 243
-  %tmp426 = load float, float addrspace(3)* %tmp425, align 4
+  %tmp421 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 241
+  %tmp422 = load float, ptr addrspace(3) %tmp421, align 4
+  %tmp423 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 242
+  %tmp424 = load float, ptr addrspace(3) %tmp423, align 4
+  %tmp425 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 243
+  %tmp426 = load float, ptr addrspace(3) %tmp425, align 4
   %tmp427 = tail call float @llvm.fmuladd.f32(float %tmp422, float %tmp424, float %tmp426)
-  %tmp428 = getelementptr inbounds float, float addrspace(3)* %arg, i32 245
-  %tmp429 = load float, float addrspace(3)* %tmp428, align 4
-  %tmp430 = getelementptr inbounds float, float addrspace(3)* %arg, i32 246
-  %tmp431 = load float, float addrspace(3)* %tmp430, align 4
-  %tmp432 = getelementptr inbounds float, float addrspace(3)* %arg, i32 247
-  %tmp433 = load float, float addrspace(3)* %tmp432, align 4
+  %tmp428 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 245
+  %tmp429 = load float, ptr addrspace(3) %tmp428, align 4
+  %tmp430 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 246
+  %tmp431 = load float, ptr addrspace(3) %tmp430, align 4
+  %tmp432 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 247
+  %tmp433 = load float, ptr addrspace(3) %tmp432, align 4
   %tmp434 = tail call float @llvm.fmuladd.f32(float %tmp429, float %tmp431, float %tmp433)
-  %tmp435 = getelementptr inbounds float, float addrspace(3)* %arg, i32 249
-  %tmp436 = load float, float addrspace(3)* %tmp435, align 4
-  %tmp437 = getelementptr inbounds float, float addrspace(3)* %arg, i32 250
-  %tmp438 = load float, float addrspace(3)* %tmp437, align 4
-  %tmp439 = getelementptr inbounds float, float addrspace(3)* %arg, i32 251
-  %tmp440 = load float, float addrspace(3)* %tmp439, align 4
+  %tmp435 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 249
+  %tmp436 = load float, ptr addrspace(3) %tmp435, align 4
+  %tmp437 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 250
+  %tmp438 = load float, ptr addrspace(3) %tmp437, align 4
+  %tmp439 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 251
+  %tmp440 = load float, ptr addrspace(3) %tmp439, align 4
   %tmp441 = tail call float @llvm.fmuladd.f32(float %tmp436, float %tmp438, float %tmp440)
-  %tmp442 = getelementptr inbounds float, float addrspace(3)* %arg, i32 253
-  %tmp443 = load float, float addrspace(3)* %tmp442, align 4
-  %tmp444 = getelementptr inbounds float, float addrspace(3)* %arg, i32 254
-  %tmp445 = load float, float addrspace(3)* %tmp444, align 4
-  %tmp446 = getelementptr inbounds float, float addrspace(3)* %arg, i32 255
-  %tmp447 = load float, float addrspace(3)* %tmp446, align 4
+  %tmp442 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 253
+  %tmp443 = load float, ptr addrspace(3) %tmp442, align 4
+  %tmp444 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 254
+  %tmp445 = load float, ptr addrspace(3) %tmp444, align 4
+  %tmp446 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 255
+  %tmp447 = load float, ptr addrspace(3) %tmp446, align 4
   %tmp448 = tail call float @llvm.fmuladd.f32(float %tmp443, float %tmp445, float %tmp447)
-  store float %tmp7, float addrspace(1)* %arg1, align 4
-  %tmp449 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 1
-  store float %tmp14, float addrspace(1)* %tmp449, align 4
-  %tmp450 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 2
-  store float %tmp21, float addrspace(1)* %tmp450, align 4
-  %tmp451 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 3
-  store float %tmp28, float addrspace(1)* %tmp451, align 4
-  %tmp452 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 4
-  store float %tmp35, float addrspace(1)* %tmp452, align 4
-  %tmp453 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 5
-  store float %tmp42, float addrspace(1)* %tmp453, align 4
-  %tmp454 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 6
-  store float %tmp49, float addrspace(1)* %tmp454, align 4
-  %tmp455 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 7
-  store float %tmp56, float addrspace(1)* %tmp455, align 4
-  %tmp456 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 8
-  store float %tmp63, float addrspace(1)* %tmp456, align 4
-  %tmp457 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 9
-  store float %tmp70, float addrspace(1)* %tmp457, align 4
-  %tmp458 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 10
-  store float %tmp77, float addrspace(1)* %tmp458, align 4
-  %tmp459 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 11
-  store float %tmp84, float addrspace(1)* %tmp459, align 4
-  %tmp460 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 12
-  store float %tmp91, float addrspace(1)* %tmp460, align 4
-  %tmp461 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 13
-  store float %tmp98, float addrspace(1)* %tmp461, align 4
-  %tmp462 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 14
-  store float %tmp105, float addrspace(1)* %tmp462, align 4
-  %tmp463 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 15
-  store float %tmp112, float addrspace(1)* %tmp463, align 4
-  %tmp464 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 16
-  store float %tmp119, float addrspace(1)* %tmp464, align 4
-  %tmp465 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 17
-  store float %tmp126, float addrspace(1)* %tmp465, align 4
-  %tmp466 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 18
-  store float %tmp133, float addrspace(1)* %tmp466, align 4
-  %tmp467 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 19
-  store float %tmp140, float addrspace(1)* %tmp467, align 4
-  %tmp468 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 20
-  store float %tmp147, float addrspace(1)* %tmp468, align 4
-  %tmp469 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 21
-  store float %tmp154, float addrspace(1)* %tmp469, align 4
-  %tmp470 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 22
-  store float %tmp161, float addrspace(1)* %tmp470, align 4
-  %tmp471 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 23
-  store float %tmp168, float addrspace(1)* %tmp471, align 4
-  %tmp472 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 24
-  store float %tmp175, float addrspace(1)* %tmp472, align 4
-  %tmp473 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 25
-  store float %tmp182, float addrspace(1)* %tmp473, align 4
-  %tmp474 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 26
-  store float %tmp189, float addrspace(1)* %tmp474, align 4
-  %tmp475 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 27
-  store float %tmp196, float addrspace(1)* %tmp475, align 4
-  %tmp476 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 28
-  store float %tmp203, float addrspace(1)* %tmp476, align 4
-  %tmp477 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 29
-  store float %tmp210, float addrspace(1)* %tmp477, align 4
-  %tmp478 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 30
-  store float %tmp217, float addrspace(1)* %tmp478, align 4
-  %tmp479 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 31
-  store float %tmp224, float addrspace(1)* %tmp479, align 4
-  %tmp480 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 32
-  store float %tmp231, float addrspace(1)* %tmp480, align 4
-  %tmp481 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 33
-  store float %tmp238, float addrspace(1)* %tmp481, align 4
-  %tmp482 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 34
-  store float %tmp245, float addrspace(1)* %tmp482, align 4
-  %tmp483 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 35
-  store float %tmp252, float addrspace(1)* %tmp483, align 4
-  %tmp484 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 36
-  store float %tmp259, float addrspace(1)* %tmp484, align 4
-  %tmp485 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 37
-  store float %tmp266, float addrspace(1)* %tmp485, align 4
-  %tmp486 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 38
-  store float %tmp273, float addrspace(1)* %tmp486, align 4
-  %tmp487 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 39
-  store float %tmp280, float addrspace(1)* %tmp487, align 4
-  %tmp488 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 40
-  store float %tmp287, float addrspace(1)* %tmp488, align 4
-  %tmp489 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 41
-  store float %tmp294, float addrspace(1)* %tmp489, align 4
-  %tmp490 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 42
-  store float %tmp301, float addrspace(1)* %tmp490, align 4
-  %tmp491 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 43
-  store float %tmp308, float addrspace(1)* %tmp491, align 4
-  %tmp492 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 44
-  store float %tmp315, float addrspace(1)* %tmp492, align 4
-  %tmp493 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 45
-  store float %tmp322, float addrspace(1)* %tmp493, align 4
-  %tmp494 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 46
-  store float %tmp329, float addrspace(1)* %tmp494, align 4
-  %tmp495 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 47
-  store float %tmp336, float addrspace(1)* %tmp495, align 4
-  %tmp496 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 48
-  store float %tmp343, float addrspace(1)* %tmp496, align 4
-  %tmp497 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 49
-  store float %tmp350, float addrspace(1)* %tmp497, align 4
-  %tmp498 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 50
-  store float %tmp357, float addrspace(1)* %tmp498, align 4
-  %tmp499 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 51
-  store float %tmp364, float addrspace(1)* %tmp499, align 4
-  %tmp500 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 52
-  store float %tmp371, float addrspace(1)* %tmp500, align 4
-  %tmp501 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 53
-  store float %tmp378, float addrspace(1)* %tmp501, align 4
-  %tmp502 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 54
-  store float %tmp385, float addrspace(1)* %tmp502, align 4
-  %tmp503 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 55
-  store float %tmp392, float addrspace(1)* %tmp503, align 4
-  %tmp504 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 56
-  store float %tmp399, float addrspace(1)* %tmp504, align 4
-  %tmp505 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 57
-  store float %tmp406, float addrspace(1)* %tmp505, align 4
-  %tmp506 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 58
-  store float %tmp413, float addrspace(1)* %tmp506, align 4
-  %tmp507 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 59
-  store float %tmp420, float addrspace(1)* %tmp507, align 4
-  %tmp508 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 60
-  store float %tmp427, float addrspace(1)* %tmp508, align 4
-  %tmp509 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 61
-  store float %tmp434, float addrspace(1)* %tmp509, align 4
-  %tmp510 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 62
-  store float %tmp441, float addrspace(1)* %tmp510, align 4
-  %tmp511 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 63
-  store float %tmp448, float addrspace(1)* %tmp511, align 4
+  store float %tmp7, ptr addrspace(1) %arg1, align 4
+  %tmp449 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 1
+  store float %tmp14, ptr addrspace(1) %tmp449, align 4
+  %tmp450 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 2
+  store float %tmp21, ptr addrspace(1) %tmp450, align 4
+  %tmp451 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 3
+  store float %tmp28, ptr addrspace(1) %tmp451, align 4
+  %tmp452 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 4
+  store float %tmp35, ptr addrspace(1) %tmp452, align 4
+  %tmp453 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 5
+  store float %tmp42, ptr addrspace(1) %tmp453, align 4
+  %tmp454 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 6
+  store float %tmp49, ptr addrspace(1) %tmp454, align 4
+  %tmp455 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 7
+  store float %tmp56, ptr addrspace(1) %tmp455, align 4
+  %tmp456 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 8
+  store float %tmp63, ptr addrspace(1) %tmp456, align 4
+  %tmp457 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 9
+  store float %tmp70, ptr addrspace(1) %tmp457, align 4
+  %tmp458 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 10
+  store float %tmp77, ptr addrspace(1) %tmp458, align 4
+  %tmp459 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 11
+  store float %tmp84, ptr addrspace(1) %tmp459, align 4
+  %tmp460 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 12
+  store float %tmp91, ptr addrspace(1) %tmp460, align 4
+  %tmp461 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 13
+  store float %tmp98, ptr addrspace(1) %tmp461, align 4
+  %tmp462 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 14
+  store float %tmp105, ptr addrspace(1) %tmp462, align 4
+  %tmp463 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 15
+  store float %tmp112, ptr addrspace(1) %tmp463, align 4
+  %tmp464 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 16
+  store float %tmp119, ptr addrspace(1) %tmp464, align 4
+  %tmp465 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 17
+  store float %tmp126, ptr addrspace(1) %tmp465, align 4
+  %tmp466 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 18
+  store float %tmp133, ptr addrspace(1) %tmp466, align 4
+  %tmp467 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 19
+  store float %tmp140, ptr addrspace(1) %tmp467, align 4
+  %tmp468 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 20
+  store float %tmp147, ptr addrspace(1) %tmp468, align 4
+  %tmp469 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 21
+  store float %tmp154, ptr addrspace(1) %tmp469, align 4
+  %tmp470 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 22
+  store float %tmp161, ptr addrspace(1) %tmp470, align 4
+  %tmp471 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 23
+  store float %tmp168, ptr addrspace(1) %tmp471, align 4
+  %tmp472 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 24
+  store float %tmp175, ptr addrspace(1) %tmp472, align 4
+  %tmp473 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 25
+  store float %tmp182, ptr addrspace(1) %tmp473, align 4
+  %tmp474 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 26
+  store float %tmp189, ptr addrspace(1) %tmp474, align 4
+  %tmp475 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 27
+  store float %tmp196, ptr addrspace(1) %tmp475, align 4
+  %tmp476 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 28
+  store float %tmp203, ptr addrspace(1) %tmp476, align 4
+  %tmp477 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 29
+  store float %tmp210, ptr addrspace(1) %tmp477, align 4
+  %tmp478 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 30
+  store float %tmp217, ptr addrspace(1) %tmp478, align 4
+  %tmp479 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 31
+  store float %tmp224, ptr addrspace(1) %tmp479, align 4
+  %tmp480 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 32
+  store float %tmp231, ptr addrspace(1) %tmp480, align 4
+  %tmp481 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 33
+  store float %tmp238, ptr addrspace(1) %tmp481, align 4
+  %tmp482 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 34
+  store float %tmp245, ptr addrspace(1) %tmp482, align 4
+  %tmp483 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 35
+  store float %tmp252, ptr addrspace(1) %tmp483, align 4
+  %tmp484 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 36
+  store float %tmp259, ptr addrspace(1) %tmp484, align 4
+  %tmp485 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 37
+  store float %tmp266, ptr addrspace(1) %tmp485, align 4
+  %tmp486 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 38
+  store float %tmp273, ptr addrspace(1) %tmp486, align 4
+  %tmp487 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 39
+  store float %tmp280, ptr addrspace(1) %tmp487, align 4
+  %tmp488 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 40
+  store float %tmp287, ptr addrspace(1) %tmp488, align 4
+  %tmp489 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 41
+  store float %tmp294, ptr addrspace(1) %tmp489, align 4
+  %tmp490 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 42
+  store float %tmp301, ptr addrspace(1) %tmp490, align 4
+  %tmp491 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 43
+  store float %tmp308, ptr addrspace(1) %tmp491, align 4
+  %tmp492 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 44
+  store float %tmp315, ptr addrspace(1) %tmp492, align 4
+  %tmp493 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 45
+  store float %tmp322, ptr addrspace(1) %tmp493, align 4
+  %tmp494 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 46
+  store float %tmp329, ptr addrspace(1) %tmp494, align 4
+  %tmp495 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 47
+  store float %tmp336, ptr addrspace(1) %tmp495, align 4
+  %tmp496 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 48
+  store float %tmp343, ptr addrspace(1) %tmp496, align 4
+  %tmp497 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 49
+  store float %tmp350, ptr addrspace(1) %tmp497, align 4
+  %tmp498 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 50
+  store float %tmp357, ptr addrspace(1) %tmp498, align 4
+  %tmp499 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 51
+  store float %tmp364, ptr addrspace(1) %tmp499, align 4
+  %tmp500 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 52
+  store float %tmp371, ptr addrspace(1) %tmp500, align 4
+  %tmp501 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 53
+  store float %tmp378, ptr addrspace(1) %tmp501, align 4
+  %tmp502 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 54
+  store float %tmp385, ptr addrspace(1) %tmp502, align 4
+  %tmp503 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 55
+  store float %tmp392, ptr addrspace(1) %tmp503, align 4
+  %tmp504 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 56
+  store float %tmp399, ptr addrspace(1) %tmp504, align 4
+  %tmp505 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 57
+  store float %tmp406, ptr addrspace(1) %tmp505, align 4
+  %tmp506 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 58
+  store float %tmp413, ptr addrspace(1) %tmp506, align 4
+  %tmp507 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 59
+  store float %tmp420, ptr addrspace(1) %tmp507, align 4
+  %tmp508 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 60
+  store float %tmp427, ptr addrspace(1) %tmp508, align 4
+  %tmp509 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 61
+  store float %tmp434, ptr addrspace(1) %tmp509, align 4
+  %tmp510 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 62
+  store float %tmp441, ptr addrspace(1) %tmp510, align 4
+  %tmp511 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 63
+  store float %tmp448, ptr addrspace(1) %tmp511, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
index 0645bb474ca0e..c1a022713718f 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
@@ -7,9 +7,9 @@
 
 ; VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
 ; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
-define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
-  store i32 %x, i32 addrspace(1)* %out0, align 4
-  store i32 %y, i32 addrspace(1)* %out1, align 4
+define amdgpu_kernel void @cluster_arg_loads(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) nounwind {
+  store i32 %x, ptr addrspace(1) %out0, align 4
+  store i32 %y, ptr addrspace(1) %out1, align 4
   ret void
 }
 
@@ -21,7 +21,7 @@ define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrsp
 ; GCN: s_load_dwordx2
 ; GCN: s_load_dwordx2
 ; GCN: s_endpgm
-define amdgpu_kernel void @same_base_ptr_crash(i64 addrspace(1)* %out,
+define amdgpu_kernel void @same_base_ptr_crash(ptr addrspace(1) %out,
     i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7,
     i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15,
     i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23,
@@ -40,6 +40,6 @@ define amdgpu_kernel void @same_base_ptr_crash(i64 addrspace(1)* %out,
     i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) {
 entry:
   %value = add i64 %arg124, %arg126
-  store i64 %value, i64 addrspace(1)* %out, align 8
+  store i64 %value, ptr addrspace(1) %out, align 8
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
index 139669bbe6d08..0d1808a0ee0b2 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
@@ -15,24 +15,24 @@
 ; GCN: NumVgprs: {{[0-9]$}}
 ; GCN: ScratchSize: 0{{$}}
 
-define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1, i1 %cnd) #1 {
+define amdgpu_kernel void @load_store_max_9vgprs(ptr addrspace(1) nocapture noalias readonly %arg, ptr addrspace(1) nocapture noalias %arg1, i1 %cnd) #1 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
+  %base = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i32 %id
   br i1 %cnd, label %bb1, label %bb2
 
 bb1:
-  %tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
-  %tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
-  %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
-  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
-  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 5
-  %tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 4
-  store <4 x i32> %tmp2, <4 x i32> addrspace(1)* %arg1, align 4
-  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
-  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
-  %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
-  store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
+  %tmp = getelementptr inbounds <4 x i32>, ptr addrspace(1) %base, i32 1
+  %tmp2 = load <4 x i32>, ptr addrspace(1) %tmp, align 4
+  %tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %base, i32 3
+  %tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 4
+  %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %base, i32 5
+  %tmp6 = load <4 x i32>, ptr addrspace(1) %tmp5, align 4
+  store <4 x i32> %tmp2, ptr addrspace(1) %arg1, align 4
+  %tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 3
+  store <4 x i32> %tmp4, ptr addrspace(1) %tmp7, align 4
+  %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 5
+  store <4 x i32> %tmp6, ptr addrspace(1) %tmp8, align 4
   br label %bb2
 
 bb2:

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
index a9e395108698e..a56a60935d3af 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
@@ -5,583 +5,583 @@
 ; We expect a two digit VGPR usage here, not a three digit.
 ; CHECK: NumVgprs: {{[0-9][0-9]$}}
 
-define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) {
+define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
 bb:
-  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
-  %tmp2 = load float, float addrspace(3)* %tmp, align 4
-  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
-  %tmp4 = load float, float addrspace(3)* %tmp3, align 4
-  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 3
-  %tmp6 = load float, float addrspace(3)* %tmp5, align 4
+  %tmp = getelementptr inbounds float, ptr addrspace(3) %arg, i32 1
+  %tmp2 = load float, ptr addrspace(3) %tmp, align 4
+  %tmp3 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 2
+  %tmp4 = load float, ptr addrspace(3) %tmp3, align 4
+  %tmp5 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 3
+  %tmp6 = load float, ptr addrspace(3) %tmp5, align 4
   %tmp7 = tail call float @llvm.fmuladd.f32(float %tmp2, float %tmp4, float %tmp6)
-  %tmp8 = getelementptr inbounds float, float addrspace(3)* %arg, i32 5
-  %tmp9 = load float, float addrspace(3)* %tmp8, align 4
-  %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6
-  %tmp11 = load float, float addrspace(3)* %tmp10, align 4
-  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 7
-  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
+  %tmp8 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 5
+  %tmp9 = load float, ptr addrspace(3) %tmp8, align 4
+  %tmp10 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 6
+  %tmp11 = load float, ptr addrspace(3) %tmp10, align 4
+  %tmp12 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 7
+  %tmp13 = load float, ptr addrspace(3) %tmp12, align 4
   %tmp14 = tail call float @llvm.fmuladd.f32(float %tmp9, float %tmp11, float %tmp13)
-  %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 9
-  %tmp16 = load float, float addrspace(3)* %tmp15, align 4
-  %tmp17 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10
-  %tmp18 = load float, float addrspace(3)* %tmp17, align 4
-  %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 11
-  %tmp20 = load float, float addrspace(3)* %tmp19, align 4
+  %tmp15 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 9
+  %tmp16 = load float, ptr addrspace(3) %tmp15, align 4
+  %tmp17 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 10
+  %tmp18 = load float, ptr addrspace(3) %tmp17, align 4
+  %tmp19 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 11
+  %tmp20 = load float, ptr addrspace(3) %tmp19, align 4
   %tmp21 = tail call float @llvm.fmuladd.f32(float %tmp16, float %tmp18, float %tmp20)
-  %tmp22 = getelementptr inbounds float, float addrspace(3)* %arg, i32 13
-  %tmp23 = load float, float addrspace(3)* %tmp22, align 4
-  %tmp24 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14
-  %tmp25 = load float, float addrspace(3)* %tmp24, align 4
-  %tmp26 = getelementptr inbounds float, float addrspace(3)* %arg, i32 15
-  %tmp27 = load float, float addrspace(3)* %tmp26, align 4
+  %tmp22 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 13
+  %tmp23 = load float, ptr addrspace(3) %tmp22, align 4
+  %tmp24 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 14
+  %tmp25 = load float, ptr addrspace(3) %tmp24, align 4
+  %tmp26 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 15
+  %tmp27 = load float, ptr addrspace(3) %tmp26, align 4
   %tmp28 = tail call float @llvm.fmuladd.f32(float %tmp23, float %tmp25, float %tmp27)
-  %tmp29 = getelementptr inbounds float, float addrspace(3)* %arg, i32 17
-  %tmp30 = load float, float addrspace(3)* %tmp29, align 4
-  %tmp31 = getelementptr inbounds float, float addrspace(3)* %arg, i32 18
-  %tmp32 = load float, float addrspace(3)* %tmp31, align 4
-  %tmp33 = getelementptr inbounds float, float addrspace(3)* %arg, i32 19
-  %tmp34 = load float, float addrspace(3)* %tmp33, align 4
+  %tmp29 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 17
+  %tmp30 = load float, ptr addrspace(3) %tmp29, align 4
+  %tmp31 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 18
+  %tmp32 = load float, ptr addrspace(3) %tmp31, align 4
+  %tmp33 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 19
+  %tmp34 = load float, ptr addrspace(3) %tmp33, align 4
   %tmp35 = tail call float @llvm.fmuladd.f32(float %tmp30, float %tmp32, float %tmp34)
-  %tmp36 = getelementptr inbounds float, float addrspace(3)* %arg, i32 21
-  %tmp37 = load float, float addrspace(3)* %tmp36, align 4
-  %tmp38 = getelementptr inbounds float, float addrspace(3)* %arg, i32 22
-  %tmp39 = load float, float addrspace(3)* %tmp38, align 4
-  %tmp40 = getelementptr inbounds float, float addrspace(3)* %arg, i32 23
-  %tmp41 = load float, float addrspace(3)* %tmp40, align 4
+  %tmp36 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 21
+  %tmp37 = load float, ptr addrspace(3) %tmp36, align 4
+  %tmp38 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 22
+  %tmp39 = load float, ptr addrspace(3) %tmp38, align 4
+  %tmp40 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 23
+  %tmp41 = load float, ptr addrspace(3) %tmp40, align 4
   %tmp42 = tail call float @llvm.fmuladd.f32(float %tmp37, float %tmp39, float %tmp41)
-  %tmp43 = getelementptr inbounds float, float addrspace(3)* %arg, i32 25
-  %tmp44 = load float, float addrspace(3)* %tmp43, align 4
-  %tmp45 = getelementptr inbounds float, float addrspace(3)* %arg, i32 26
-  %tmp46 = load float, float addrspace(3)* %tmp45, align 4
-  %tmp47 = getelementptr inbounds float, float addrspace(3)* %arg, i32 27
-  %tmp48 = load float, float addrspace(3)* %tmp47, align 4
+  %tmp43 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 25
+  %tmp44 = load float, ptr addrspace(3) %tmp43, align 4
+  %tmp45 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 26
+  %tmp46 = load float, ptr addrspace(3) %tmp45, align 4
+  %tmp47 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 27
+  %tmp48 = load float, ptr addrspace(3) %tmp47, align 4
   %tmp49 = tail call float @llvm.fmuladd.f32(float %tmp44, float %tmp46, float %tmp48)
-  %tmp50 = getelementptr inbounds float, float addrspace(3)* %arg, i32 29
-  %tmp51 = load float, float addrspace(3)* %tmp50, align 4
-  %tmp52 = getelementptr inbounds float, float addrspace(3)* %arg, i32 30
-  %tmp53 = load float, float addrspace(3)* %tmp52, align 4
-  %tmp54 = getelementptr inbounds float, float addrspace(3)* %arg, i32 31
-  %tmp55 = load float, float addrspace(3)* %tmp54, align 4
+  %tmp50 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 29
+  %tmp51 = load float, ptr addrspace(3) %tmp50, align 4
+  %tmp52 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 30
+  %tmp53 = load float, ptr addrspace(3) %tmp52, align 4
+  %tmp54 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 31
+  %tmp55 = load float, ptr addrspace(3) %tmp54, align 4
   %tmp56 = tail call float @llvm.fmuladd.f32(float %tmp51, float %tmp53, float %tmp55)
-  %tmp57 = getelementptr inbounds float, float addrspace(3)* %arg, i32 33
-  %tmp58 = load float, float addrspace(3)* %tmp57, align 4
-  %tmp59 = getelementptr inbounds float, float addrspace(3)* %arg, i32 34
-  %tmp60 = load float, float addrspace(3)* %tmp59, align 4
-  %tmp61 = getelementptr inbounds float, float addrspace(3)* %arg, i32 35
-  %tmp62 = load float, float addrspace(3)* %tmp61, align 4
+  %tmp57 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 33
+  %tmp58 = load float, ptr addrspace(3) %tmp57, align 4
+  %tmp59 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 34
+  %tmp60 = load float, ptr addrspace(3) %tmp59, align 4
+  %tmp61 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 35
+  %tmp62 = load float, ptr addrspace(3) %tmp61, align 4
   %tmp63 = tail call float @llvm.fmuladd.f32(float %tmp58, float %tmp60, float %tmp62)
-  %tmp64 = getelementptr inbounds float, float addrspace(3)* %arg, i32 37
-  %tmp65 = load float, float addrspace(3)* %tmp64, align 4
-  %tmp66 = getelementptr inbounds float, float addrspace(3)* %arg, i32 38
-  %tmp67 = load float, float addrspace(3)* %tmp66, align 4
-  %tmp68 = getelementptr inbounds float, float addrspace(3)* %arg, i32 39
-  %tmp69 = load float, float addrspace(3)* %tmp68, align 4
+  %tmp64 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 37
+  %tmp65 = load float, ptr addrspace(3) %tmp64, align 4
+  %tmp66 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 38
+  %tmp67 = load float, ptr addrspace(3) %tmp66, align 4
+  %tmp68 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 39
+  %tmp69 = load float, ptr addrspace(3) %tmp68, align 4
   %tmp70 = tail call float @llvm.fmuladd.f32(float %tmp65, float %tmp67, float %tmp69)
-  %tmp71 = getelementptr inbounds float, float addrspace(3)* %arg, i32 41
-  %tmp72 = load float, float addrspace(3)* %tmp71, align 4
-  %tmp73 = getelementptr inbounds float, float addrspace(3)* %arg, i32 42
-  %tmp74 = load float, float addrspace(3)* %tmp73, align 4
-  %tmp75 = getelementptr inbounds float, float addrspace(3)* %arg, i32 43
-  %tmp76 = load float, float addrspace(3)* %tmp75, align 4
+  %tmp71 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 41
+  %tmp72 = load float, ptr addrspace(3) %tmp71, align 4
+  %tmp73 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 42
+  %tmp74 = load float, ptr addrspace(3) %tmp73, align 4
+  %tmp75 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 43
+  %tmp76 = load float, ptr addrspace(3) %tmp75, align 4
   %tmp77 = tail call float @llvm.fmuladd.f32(float %tmp72, float %tmp74, float %tmp76)
-  %tmp78 = getelementptr inbounds float, float addrspace(3)* %arg, i32 45
-  %tmp79 = load float, float addrspace(3)* %tmp78, align 4
-  %tmp80 = getelementptr inbounds float, float addrspace(3)* %arg, i32 46
-  %tmp81 = load float, float addrspace(3)* %tmp80, align 4
-  %tmp82 = getelementptr inbounds float, float addrspace(3)* %arg, i32 47
-  %tmp83 = load float, float addrspace(3)* %tmp82, align 4
+  %tmp78 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 45
+  %tmp79 = load float, ptr addrspace(3) %tmp78, align 4
+  %tmp80 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 46
+  %tmp81 = load float, ptr addrspace(3) %tmp80, align 4
+  %tmp82 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 47
+  %tmp83 = load float, ptr addrspace(3) %tmp82, align 4
   %tmp84 = tail call float @llvm.fmuladd.f32(float %tmp79, float %tmp81, float %tmp83)
-  %tmp85 = getelementptr inbounds float, float addrspace(3)* %arg, i32 49
-  %tmp86 = load float, float addrspace(3)* %tmp85, align 4
-  %tmp87 = getelementptr inbounds float, float addrspace(3)* %arg, i32 50
-  %tmp88 = load float, float addrspace(3)* %tmp87, align 4
-  %tmp89 = getelementptr inbounds float, float addrspace(3)* %arg, i32 51
-  %tmp90 = load float, float addrspace(3)* %tmp89, align 4
+  %tmp85 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 49
+  %tmp86 = load float, ptr addrspace(3) %tmp85, align 4
+  %tmp87 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 50
+  %tmp88 = load float, ptr addrspace(3) %tmp87, align 4
+  %tmp89 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 51
+  %tmp90 = load float, ptr addrspace(3) %tmp89, align 4
   %tmp91 = tail call float @llvm.fmuladd.f32(float %tmp86, float %tmp88, float %tmp90)
-  %tmp92 = getelementptr inbounds float, float addrspace(3)* %arg, i32 53
-  %tmp93 = load float, float addrspace(3)* %tmp92, align 4
-  %tmp94 = getelementptr inbounds float, float addrspace(3)* %arg, i32 54
-  %tmp95 = load float, float addrspace(3)* %tmp94, align 4
-  %tmp96 = getelementptr inbounds float, float addrspace(3)* %arg, i32 55
-  %tmp97 = load float, float addrspace(3)* %tmp96, align 4
+  %tmp92 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 53
+  %tmp93 = load float, ptr addrspace(3) %tmp92, align 4
+  %tmp94 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 54
+  %tmp95 = load float, ptr addrspace(3) %tmp94, align 4
+  %tmp96 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 55
+  %tmp97 = load float, ptr addrspace(3) %tmp96, align 4
   %tmp98 = tail call float @llvm.fmuladd.f32(float %tmp93, float %tmp95, float %tmp97)
-  %tmp99 = getelementptr inbounds float, float addrspace(3)* %arg, i32 57
-  %tmp100 = load float, float addrspace(3)* %tmp99, align 4
-  %tmp101 = getelementptr inbounds float, float addrspace(3)* %arg, i32 58
-  %tmp102 = load float, float addrspace(3)* %tmp101, align 4
-  %tmp103 = getelementptr inbounds float, float addrspace(3)* %arg, i32 59
-  %tmp104 = load float, float addrspace(3)* %tmp103, align 4
+  %tmp99 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 57
+  %tmp100 = load float, ptr addrspace(3) %tmp99, align 4
+  %tmp101 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 58
+  %tmp102 = load float, ptr addrspace(3) %tmp101, align 4
+  %tmp103 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 59
+  %tmp104 = load float, ptr addrspace(3) %tmp103, align 4
   %tmp105 = tail call float @llvm.fmuladd.f32(float %tmp100, float %tmp102, float %tmp104)
-  %tmp106 = getelementptr inbounds float, float addrspace(3)* %arg, i32 61
-  %tmp107 = load float, float addrspace(3)* %tmp106, align 4
-  %tmp108 = getelementptr inbounds float, float addrspace(3)* %arg, i32 62
-  %tmp109 = load float, float addrspace(3)* %tmp108, align 4
-  %tmp110 = getelementptr inbounds float, float addrspace(3)* %arg, i32 63
-  %tmp111 = load float, float addrspace(3)* %tmp110, align 4
+  %tmp106 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 61
+  %tmp107 = load float, ptr addrspace(3) %tmp106, align 4
+  %tmp108 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 62
+  %tmp109 = load float, ptr addrspace(3) %tmp108, align 4
+  %tmp110 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 63
+  %tmp111 = load float, ptr addrspace(3) %tmp110, align 4
   %tmp112 = tail call float @llvm.fmuladd.f32(float %tmp107, float %tmp109, float %tmp111)
-  %tmp113 = getelementptr inbounds float, float addrspace(3)* %arg, i32 65
-  %tmp114 = load float, float addrspace(3)* %tmp113, align 4
-  %tmp115 = getelementptr inbounds float, float addrspace(3)* %arg, i32 66
-  %tmp116 = load float, float addrspace(3)* %tmp115, align 4
-  %tmp117 = getelementptr inbounds float, float addrspace(3)* %arg, i32 67
-  %tmp118 = load float, float addrspace(3)* %tmp117, align 4
+  %tmp113 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 65
+  %tmp114 = load float, ptr addrspace(3) %tmp113, align 4
+  %tmp115 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 66
+  %tmp116 = load float, ptr addrspace(3) %tmp115, align 4
+  %tmp117 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 67
+  %tmp118 = load float, ptr addrspace(3) %tmp117, align 4
   %tmp119 = tail call float @llvm.fmuladd.f32(float %tmp114, float %tmp116, float %tmp118)
-  %tmp120 = getelementptr inbounds float, float addrspace(3)* %arg, i32 69
-  %tmp121 = load float, float addrspace(3)* %tmp120, align 4
-  %tmp122 = getelementptr inbounds float, float addrspace(3)* %arg, i32 70
-  %tmp123 = load float, float addrspace(3)* %tmp122, align 4
-  %tmp124 = getelementptr inbounds float, float addrspace(3)* %arg, i32 71
-  %tmp125 = load float, float addrspace(3)* %tmp124, align 4
+  %tmp120 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 69
+  %tmp121 = load float, ptr addrspace(3) %tmp120, align 4
+  %tmp122 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 70
+  %tmp123 = load float, ptr addrspace(3) %tmp122, align 4
+  %tmp124 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 71
+  %tmp125 = load float, ptr addrspace(3) %tmp124, align 4
   %tmp126 = tail call float @llvm.fmuladd.f32(float %tmp121, float %tmp123, float %tmp125)
-  %tmp127 = getelementptr inbounds float, float addrspace(3)* %arg, i32 73
-  %tmp128 = load float, float addrspace(3)* %tmp127, align 4
-  %tmp129 = getelementptr inbounds float, float addrspace(3)* %arg, i32 74
-  %tmp130 = load float, float addrspace(3)* %tmp129, align 4
-  %tmp131 = getelementptr inbounds float, float addrspace(3)* %arg, i32 75
-  %tmp132 = load float, float addrspace(3)* %tmp131, align 4
+  %tmp127 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 73
+  %tmp128 = load float, ptr addrspace(3) %tmp127, align 4
+  %tmp129 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 74
+  %tmp130 = load float, ptr addrspace(3) %tmp129, align 4
+  %tmp131 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 75
+  %tmp132 = load float, ptr addrspace(3) %tmp131, align 4
   %tmp133 = tail call float @llvm.fmuladd.f32(float %tmp128, float %tmp130, float %tmp132)
-  %tmp134 = getelementptr inbounds float, float addrspace(3)* %arg, i32 77
-  %tmp135 = load float, float addrspace(3)* %tmp134, align 4
-  %tmp136 = getelementptr inbounds float, float addrspace(3)* %arg, i32 78
-  %tmp137 = load float, float addrspace(3)* %tmp136, align 4
-  %tmp138 = getelementptr inbounds float, float addrspace(3)* %arg, i32 79
-  %tmp139 = load float, float addrspace(3)* %tmp138, align 4
+  %tmp134 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 77
+  %tmp135 = load float, ptr addrspace(3) %tmp134, align 4
+  %tmp136 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 78
+  %tmp137 = load float, ptr addrspace(3) %tmp136, align 4
+  %tmp138 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 79
+  %tmp139 = load float, ptr addrspace(3) %tmp138, align 4
   %tmp140 = tail call float @llvm.fmuladd.f32(float %tmp135, float %tmp137, float %tmp139)
-  %tmp141 = getelementptr inbounds float, float addrspace(3)* %arg, i32 81
-  %tmp142 = load float, float addrspace(3)* %tmp141, align 4
-  %tmp143 = getelementptr inbounds float, float addrspace(3)* %arg, i32 82
-  %tmp144 = load float, float addrspace(3)* %tmp143, align 4
-  %tmp145 = getelementptr inbounds float, float addrspace(3)* %arg, i32 83
-  %tmp146 = load float, float addrspace(3)* %tmp145, align 4
+  %tmp141 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 81
+  %tmp142 = load float, ptr addrspace(3) %tmp141, align 4
+  %tmp143 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 82
+  %tmp144 = load float, ptr addrspace(3) %tmp143, align 4
+  %tmp145 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 83
+  %tmp146 = load float, ptr addrspace(3) %tmp145, align 4
   %tmp147 = tail call float @llvm.fmuladd.f32(float %tmp142, float %tmp144, float %tmp146)
-  %tmp148 = getelementptr inbounds float, float addrspace(3)* %arg, i32 85
-  %tmp149 = load float, float addrspace(3)* %tmp148, align 4
-  %tmp150 = getelementptr inbounds float, float addrspace(3)* %arg, i32 86
-  %tmp151 = load float, float addrspace(3)* %tmp150, align 4
-  %tmp152 = getelementptr inbounds float, float addrspace(3)* %arg, i32 87
-  %tmp153 = load float, float addrspace(3)* %tmp152, align 4
+  %tmp148 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 85
+  %tmp149 = load float, ptr addrspace(3) %tmp148, align 4
+  %tmp150 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 86
+  %tmp151 = load float, ptr addrspace(3) %tmp150, align 4
+  %tmp152 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 87
+  %tmp153 = load float, ptr addrspace(3) %tmp152, align 4
   %tmp154 = tail call float @llvm.fmuladd.f32(float %tmp149, float %tmp151, float %tmp153)
-  %tmp155 = getelementptr inbounds float, float addrspace(3)* %arg, i32 89
-  %tmp156 = load float, float addrspace(3)* %tmp155, align 4
-  %tmp157 = getelementptr inbounds float, float addrspace(3)* %arg, i32 90
-  %tmp158 = load float, float addrspace(3)* %tmp157, align 4
-  %tmp159 = getelementptr inbounds float, float addrspace(3)* %arg, i32 91
-  %tmp160 = load float, float addrspace(3)* %tmp159, align 4
+  %tmp155 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 89
+  %tmp156 = load float, ptr addrspace(3) %tmp155, align 4
+  %tmp157 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 90
+  %tmp158 = load float, ptr addrspace(3) %tmp157, align 4
+  %tmp159 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 91
+  %tmp160 = load float, ptr addrspace(3) %tmp159, align 4
   %tmp161 = tail call float @llvm.fmuladd.f32(float %tmp156, float %tmp158, float %tmp160)
-  %tmp162 = getelementptr inbounds float, float addrspace(3)* %arg, i32 93
-  %tmp163 = load float, float addrspace(3)* %tmp162, align 4
-  %tmp164 = getelementptr inbounds float, float addrspace(3)* %arg, i32 94
-  %tmp165 = load float, float addrspace(3)* %tmp164, align 4
-  %tmp166 = getelementptr inbounds float, float addrspace(3)* %arg, i32 95
-  %tmp167 = load float, float addrspace(3)* %tmp166, align 4
+  %tmp162 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 93
+  %tmp163 = load float, ptr addrspace(3) %tmp162, align 4
+  %tmp164 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 94
+  %tmp165 = load float, ptr addrspace(3) %tmp164, align 4
+  %tmp166 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 95
+  %tmp167 = load float, ptr addrspace(3) %tmp166, align 4
   %tmp168 = tail call float @llvm.fmuladd.f32(float %tmp163, float %tmp165, float %tmp167)
-  %tmp169 = getelementptr inbounds float, float addrspace(3)* %arg, i32 97
-  %tmp170 = load float, float addrspace(3)* %tmp169, align 4
-  %tmp171 = getelementptr inbounds float, float addrspace(3)* %arg, i32 98
-  %tmp172 = load float, float addrspace(3)* %tmp171, align 4
-  %tmp173 = getelementptr inbounds float, float addrspace(3)* %arg, i32 99
-  %tmp174 = load float, float addrspace(3)* %tmp173, align 4
+  %tmp169 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 97
+  %tmp170 = load float, ptr addrspace(3) %tmp169, align 4
+  %tmp171 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 98
+  %tmp172 = load float, ptr addrspace(3) %tmp171, align 4
+  %tmp173 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 99
+  %tmp174 = load float, ptr addrspace(3) %tmp173, align 4
   %tmp175 = tail call float @llvm.fmuladd.f32(float %tmp170, float %tmp172, float %tmp174)
-  %tmp176 = getelementptr inbounds float, float addrspace(3)* %arg, i32 101
-  %tmp177 = load float, float addrspace(3)* %tmp176, align 4
-  %tmp178 = getelementptr inbounds float, float addrspace(3)* %arg, i32 102
-  %tmp179 = load float, float addrspace(3)* %tmp178, align 4
-  %tmp180 = getelementptr inbounds float, float addrspace(3)* %arg, i32 103
-  %tmp181 = load float, float addrspace(3)* %tmp180, align 4
+  %tmp176 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 101
+  %tmp177 = load float, ptr addrspace(3) %tmp176, align 4
+  %tmp178 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 102
+  %tmp179 = load float, ptr addrspace(3) %tmp178, align 4
+  %tmp180 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 103
+  %tmp181 = load float, ptr addrspace(3) %tmp180, align 4
   %tmp182 = tail call float @llvm.fmuladd.f32(float %tmp177, float %tmp179, float %tmp181)
-  %tmp183 = getelementptr inbounds float, float addrspace(3)* %arg, i32 105
-  %tmp184 = load float, float addrspace(3)* %tmp183, align 4
-  %tmp185 = getelementptr inbounds float, float addrspace(3)* %arg, i32 106
-  %tmp186 = load float, float addrspace(3)* %tmp185, align 4
-  %tmp187 = getelementptr inbounds float, float addrspace(3)* %arg, i32 107
-  %tmp188 = load float, float addrspace(3)* %tmp187, align 4
+  %tmp183 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 105
+  %tmp184 = load float, ptr addrspace(3) %tmp183, align 4
+  %tmp185 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 106
+  %tmp186 = load float, ptr addrspace(3) %tmp185, align 4
+  %tmp187 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 107
+  %tmp188 = load float, ptr addrspace(3) %tmp187, align 4
   %tmp189 = tail call float @llvm.fmuladd.f32(float %tmp184, float %tmp186, float %tmp188)
-  %tmp190 = getelementptr inbounds float, float addrspace(3)* %arg, i32 109
-  %tmp191 = load float, float addrspace(3)* %tmp190, align 4
-  %tmp192 = getelementptr inbounds float, float addrspace(3)* %arg, i32 110
-  %tmp193 = load float, float addrspace(3)* %tmp192, align 4
-  %tmp194 = getelementptr inbounds float, float addrspace(3)* %arg, i32 111
-  %tmp195 = load float, float addrspace(3)* %tmp194, align 4
+  %tmp190 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 109
+  %tmp191 = load float, ptr addrspace(3) %tmp190, align 4
+  %tmp192 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 110
+  %tmp193 = load float, ptr addrspace(3) %tmp192, align 4
+  %tmp194 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 111
+  %tmp195 = load float, ptr addrspace(3) %tmp194, align 4
   %tmp196 = tail call float @llvm.fmuladd.f32(float %tmp191, float %tmp193, float %tmp195)
-  %tmp197 = getelementptr inbounds float, float addrspace(3)* %arg, i32 113
-  %tmp198 = load float, float addrspace(3)* %tmp197, align 4
-  %tmp199 = getelementptr inbounds float, float addrspace(3)* %arg, i32 114
-  %tmp200 = load float, float addrspace(3)* %tmp199, align 4
-  %tmp201 = getelementptr inbounds float, float addrspace(3)* %arg, i32 115
-  %tmp202 = load float, float addrspace(3)* %tmp201, align 4
+  %tmp197 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 113
+  %tmp198 = load float, ptr addrspace(3) %tmp197, align 4
+  %tmp199 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 114
+  %tmp200 = load float, ptr addrspace(3) %tmp199, align 4
+  %tmp201 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 115
+  %tmp202 = load float, ptr addrspace(3) %tmp201, align 4
   %tmp203 = tail call float @llvm.fmuladd.f32(float %tmp198, float %tmp200, float %tmp202)
-  %tmp204 = getelementptr inbounds float, float addrspace(3)* %arg, i32 117
-  %tmp205 = load float, float addrspace(3)* %tmp204, align 4
-  %tmp206 = getelementptr inbounds float, float addrspace(3)* %arg, i32 118
-  %tmp207 = load float, float addrspace(3)* %tmp206, align 4
-  %tmp208 = getelementptr inbounds float, float addrspace(3)* %arg, i32 119
-  %tmp209 = load float, float addrspace(3)* %tmp208, align 4
+  %tmp204 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 117
+  %tmp205 = load float, ptr addrspace(3) %tmp204, align 4
+  %tmp206 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 118
+  %tmp207 = load float, ptr addrspace(3) %tmp206, align 4
+  %tmp208 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 119
+  %tmp209 = load float, ptr addrspace(3) %tmp208, align 4
   %tmp210 = tail call float @llvm.fmuladd.f32(float %tmp205, float %tmp207, float %tmp209)
-  %tmp211 = getelementptr inbounds float, float addrspace(3)* %arg, i32 121
-  %tmp212 = load float, float addrspace(3)* %tmp211, align 4
-  %tmp213 = getelementptr inbounds float, float addrspace(3)* %arg, i32 122
-  %tmp214 = load float, float addrspace(3)* %tmp213, align 4
-  %tmp215 = getelementptr inbounds float, float addrspace(3)* %arg, i32 123
-  %tmp216 = load float, float addrspace(3)* %tmp215, align 4
+  %tmp211 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 121
+  %tmp212 = load float, ptr addrspace(3) %tmp211, align 4
+  %tmp213 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 122
+  %tmp214 = load float, ptr addrspace(3) %tmp213, align 4
+  %tmp215 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 123
+  %tmp216 = load float, ptr addrspace(3) %tmp215, align 4
   %tmp217 = tail call float @llvm.fmuladd.f32(float %tmp212, float %tmp214, float %tmp216)
-  %tmp218 = getelementptr inbounds float, float addrspace(3)* %arg, i32 125
-  %tmp219 = load float, float addrspace(3)* %tmp218, align 4
-  %tmp220 = getelementptr inbounds float, float addrspace(3)* %arg, i32 126
-  %tmp221 = load float, float addrspace(3)* %tmp220, align 4
-  %tmp222 = getelementptr inbounds float, float addrspace(3)* %arg, i32 127
-  %tmp223 = load float, float addrspace(3)* %tmp222, align 4
+  %tmp218 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 125
+  %tmp219 = load float, ptr addrspace(3) %tmp218, align 4
+  %tmp220 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 126
+  %tmp221 = load float, ptr addrspace(3) %tmp220, align 4
+  %tmp222 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 127
+  %tmp223 = load float, ptr addrspace(3) %tmp222, align 4
   %tmp224 = tail call float @llvm.fmuladd.f32(float %tmp219, float %tmp221, float %tmp223)
-  %tmp225 = getelementptr inbounds float, float addrspace(3)* %arg, i32 129
-  %tmp226 = load float, float addrspace(3)* %tmp225, align 4
-  %tmp227 = getelementptr inbounds float, float addrspace(3)* %arg, i32 130
-  %tmp228 = load float, float addrspace(3)* %tmp227, align 4
-  %tmp229 = getelementptr inbounds float, float addrspace(3)* %arg, i32 131
-  %tmp230 = load float, float addrspace(3)* %tmp229, align 4
+  %tmp225 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 129
+  %tmp226 = load float, ptr addrspace(3) %tmp225, align 4
+  %tmp227 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 130
+  %tmp228 = load float, ptr addrspace(3) %tmp227, align 4
+  %tmp229 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 131
+  %tmp230 = load float, ptr addrspace(3) %tmp229, align 4
   %tmp231 = tail call float @llvm.fmuladd.f32(float %tmp226, float %tmp228, float %tmp230)
-  %tmp232 = getelementptr inbounds float, float addrspace(3)* %arg, i32 133
-  %tmp233 = load float, float addrspace(3)* %tmp232, align 4
-  %tmp234 = getelementptr inbounds float, float addrspace(3)* %arg, i32 134
-  %tmp235 = load float, float addrspace(3)* %tmp234, align 4
-  %tmp236 = getelementptr inbounds float, float addrspace(3)* %arg, i32 135
-  %tmp237 = load float, float addrspace(3)* %tmp236, align 4
+  %tmp232 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 133
+  %tmp233 = load float, ptr addrspace(3) %tmp232, align 4
+  %tmp234 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 134
+  %tmp235 = load float, ptr addrspace(3) %tmp234, align 4
+  %tmp236 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 135
+  %tmp237 = load float, ptr addrspace(3) %tmp236, align 4
   %tmp238 = tail call float @llvm.fmuladd.f32(float %tmp233, float %tmp235, float %tmp237)
-  %tmp239 = getelementptr inbounds float, float addrspace(3)* %arg, i32 137
-  %tmp240 = load float, float addrspace(3)* %tmp239, align 4
-  %tmp241 = getelementptr inbounds float, float addrspace(3)* %arg, i32 138
-  %tmp242 = load float, float addrspace(3)* %tmp241, align 4
-  %tmp243 = getelementptr inbounds float, float addrspace(3)* %arg, i32 139
-  %tmp244 = load float, float addrspace(3)* %tmp243, align 4
+  %tmp239 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 137
+  %tmp240 = load float, ptr addrspace(3) %tmp239, align 4
+  %tmp241 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 138
+  %tmp242 = load float, ptr addrspace(3) %tmp241, align 4
+  %tmp243 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 139
+  %tmp244 = load float, ptr addrspace(3) %tmp243, align 4
   %tmp245 = tail call float @llvm.fmuladd.f32(float %tmp240, float %tmp242, float %tmp244)
-  %tmp246 = getelementptr inbounds float, float addrspace(3)* %arg, i32 141
-  %tmp247 = load float, float addrspace(3)* %tmp246, align 4
-  %tmp248 = getelementptr inbounds float, float addrspace(3)* %arg, i32 142
-  %tmp249 = load float, float addrspace(3)* %tmp248, align 4
-  %tmp250 = getelementptr inbounds float, float addrspace(3)* %arg, i32 143
-  %tmp251 = load float, float addrspace(3)* %tmp250, align 4
+  %tmp246 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 141
+  %tmp247 = load float, ptr addrspace(3) %tmp246, align 4
+  %tmp248 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 142
+  %tmp249 = load float, ptr addrspace(3) %tmp248, align 4
+  %tmp250 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 143
+  %tmp251 = load float, ptr addrspace(3) %tmp250, align 4
   %tmp252 = tail call float @llvm.fmuladd.f32(float %tmp247, float %tmp249, float %tmp251)
-  %tmp253 = getelementptr inbounds float, float addrspace(3)* %arg, i32 145
-  %tmp254 = load float, float addrspace(3)* %tmp253, align 4
-  %tmp255 = getelementptr inbounds float, float addrspace(3)* %arg, i32 146
-  %tmp256 = load float, float addrspace(3)* %tmp255, align 4
-  %tmp257 = getelementptr inbounds float, float addrspace(3)* %arg, i32 147
-  %tmp258 = load float, float addrspace(3)* %tmp257, align 4
+  %tmp253 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 145
+  %tmp254 = load float, ptr addrspace(3) %tmp253, align 4
+  %tmp255 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 146
+  %tmp256 = load float, ptr addrspace(3) %tmp255, align 4
+  %tmp257 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 147
+  %tmp258 = load float, ptr addrspace(3) %tmp257, align 4
   %tmp259 = tail call float @llvm.fmuladd.f32(float %tmp254, float %tmp256, float %tmp258)
-  %tmp260 = getelementptr inbounds float, float addrspace(3)* %arg, i32 149
-  %tmp261 = load float, float addrspace(3)* %tmp260, align 4
-  %tmp262 = getelementptr inbounds float, float addrspace(3)* %arg, i32 150
-  %tmp263 = load float, float addrspace(3)* %tmp262, align 4
-  %tmp264 = getelementptr inbounds float, float addrspace(3)* %arg, i32 151
-  %tmp265 = load float, float addrspace(3)* %tmp264, align 4
+  %tmp260 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 149
+  %tmp261 = load float, ptr addrspace(3) %tmp260, align 4
+  %tmp262 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 150
+  %tmp263 = load float, ptr addrspace(3) %tmp262, align 4
+  %tmp264 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 151
+  %tmp265 = load float, ptr addrspace(3) %tmp264, align 4
   %tmp266 = tail call float @llvm.fmuladd.f32(float %tmp261, float %tmp263, float %tmp265)
-  %tmp267 = getelementptr inbounds float, float addrspace(3)* %arg, i32 153
-  %tmp268 = load float, float addrspace(3)* %tmp267, align 4
-  %tmp269 = getelementptr inbounds float, float addrspace(3)* %arg, i32 154
-  %tmp270 = load float, float addrspace(3)* %tmp269, align 4
-  %tmp271 = getelementptr inbounds float, float addrspace(3)* %arg, i32 155
-  %tmp272 = load float, float addrspace(3)* %tmp271, align 4
+  %tmp267 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 153
+  %tmp268 = load float, ptr addrspace(3) %tmp267, align 4
+  %tmp269 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 154
+  %tmp270 = load float, ptr addrspace(3) %tmp269, align 4
+  %tmp271 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 155
+  %tmp272 = load float, ptr addrspace(3) %tmp271, align 4
   %tmp273 = tail call float @llvm.fmuladd.f32(float %tmp268, float %tmp270, float %tmp272)
-  %tmp274 = getelementptr inbounds float, float addrspace(3)* %arg, i32 157
-  %tmp275 = load float, float addrspace(3)* %tmp274, align 4
-  %tmp276 = getelementptr inbounds float, float addrspace(3)* %arg, i32 158
-  %tmp277 = load float, float addrspace(3)* %tmp276, align 4
-  %tmp278 = getelementptr inbounds float, float addrspace(3)* %arg, i32 159
-  %tmp279 = load float, float addrspace(3)* %tmp278, align 4
+  %tmp274 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 157
+  %tmp275 = load float, ptr addrspace(3) %tmp274, align 4
+  %tmp276 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 158
+  %tmp277 = load float, ptr addrspace(3) %tmp276, align 4
+  %tmp278 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 159
+  %tmp279 = load float, ptr addrspace(3) %tmp278, align 4
   %tmp280 = tail call float @llvm.fmuladd.f32(float %tmp275, float %tmp277, float %tmp279)
-  %tmp281 = getelementptr inbounds float, float addrspace(3)* %arg, i32 161
-  %tmp282 = load float, float addrspace(3)* %tmp281, align 4
-  %tmp283 = getelementptr inbounds float, float addrspace(3)* %arg, i32 162
-  %tmp284 = load float, float addrspace(3)* %tmp283, align 4
-  %tmp285 = getelementptr inbounds float, float addrspace(3)* %arg, i32 163
-  %tmp286 = load float, float addrspace(3)* %tmp285, align 4
+  %tmp281 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 161
+  %tmp282 = load float, ptr addrspace(3) %tmp281, align 4
+  %tmp283 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 162
+  %tmp284 = load float, ptr addrspace(3) %tmp283, align 4
+  %tmp285 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 163
+  %tmp286 = load float, ptr addrspace(3) %tmp285, align 4
   %tmp287 = tail call float @llvm.fmuladd.f32(float %tmp282, float %tmp284, float %tmp286)
-  %tmp288 = getelementptr inbounds float, float addrspace(3)* %arg, i32 165
-  %tmp289 = load float, float addrspace(3)* %tmp288, align 4
-  %tmp290 = getelementptr inbounds float, float addrspace(3)* %arg, i32 166
-  %tmp291 = load float, float addrspace(3)* %tmp290, align 4
-  %tmp292 = getelementptr inbounds float, float addrspace(3)* %arg, i32 167
-  %tmp293 = load float, float addrspace(3)* %tmp292, align 4
+  %tmp288 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 165
+  %tmp289 = load float, ptr addrspace(3) %tmp288, align 4
+  %tmp290 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 166
+  %tmp291 = load float, ptr addrspace(3) %tmp290, align 4
+  %tmp292 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 167
+  %tmp293 = load float, ptr addrspace(3) %tmp292, align 4
   %tmp294 = tail call float @llvm.fmuladd.f32(float %tmp289, float %tmp291, float %tmp293)
-  %tmp295 = getelementptr inbounds float, float addrspace(3)* %arg, i32 169
-  %tmp296 = load float, float addrspace(3)* %tmp295, align 4
-  %tmp297 = getelementptr inbounds float, float addrspace(3)* %arg, i32 170
-  %tmp298 = load float, float addrspace(3)* %tmp297, align 4
-  %tmp299 = getelementptr inbounds float, float addrspace(3)* %arg, i32 171
-  %tmp300 = load float, float addrspace(3)* %tmp299, align 4
+  %tmp295 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 169
+  %tmp296 = load float, ptr addrspace(3) %tmp295, align 4
+  %tmp297 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 170
+  %tmp298 = load float, ptr addrspace(3) %tmp297, align 4
+  %tmp299 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 171
+  %tmp300 = load float, ptr addrspace(3) %tmp299, align 4
   %tmp301 = tail call float @llvm.fmuladd.f32(float %tmp296, float %tmp298, float %tmp300)
-  %tmp302 = getelementptr inbounds float, float addrspace(3)* %arg, i32 173
-  %tmp303 = load float, float addrspace(3)* %tmp302, align 4
-  %tmp304 = getelementptr inbounds float, float addrspace(3)* %arg, i32 174
-  %tmp305 = load float, float addrspace(3)* %tmp304, align 4
-  %tmp306 = getelementptr inbounds float, float addrspace(3)* %arg, i32 175
-  %tmp307 = load float, float addrspace(3)* %tmp306, align 4
+  %tmp302 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 173
+  %tmp303 = load float, ptr addrspace(3) %tmp302, align 4
+  %tmp304 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 174
+  %tmp305 = load float, ptr addrspace(3) %tmp304, align 4
+  %tmp306 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 175
+  %tmp307 = load float, ptr addrspace(3) %tmp306, align 4
   %tmp308 = tail call float @llvm.fmuladd.f32(float %tmp303, float %tmp305, float %tmp307)
-  %tmp309 = getelementptr inbounds float, float addrspace(3)* %arg, i32 177
-  %tmp310 = load float, float addrspace(3)* %tmp309, align 4
-  %tmp311 = getelementptr inbounds float, float addrspace(3)* %arg, i32 178
-  %tmp312 = load float, float addrspace(3)* %tmp311, align 4
-  %tmp313 = getelementptr inbounds float, float addrspace(3)* %arg, i32 179
-  %tmp314 = load float, float addrspace(3)* %tmp313, align 4
+  %tmp309 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 177
+  %tmp310 = load float, ptr addrspace(3) %tmp309, align 4
+  %tmp311 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 178
+  %tmp312 = load float, ptr addrspace(3) %tmp311, align 4
+  %tmp313 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 179
+  %tmp314 = load float, ptr addrspace(3) %tmp313, align 4
   %tmp315 = tail call float @llvm.fmuladd.f32(float %tmp310, float %tmp312, float %tmp314)
-  %tmp316 = getelementptr inbounds float, float addrspace(3)* %arg, i32 181
-  %tmp317 = load float, float addrspace(3)* %tmp316, align 4
-  %tmp318 = getelementptr inbounds float, float addrspace(3)* %arg, i32 182
-  %tmp319 = load float, float addrspace(3)* %tmp318, align 4
-  %tmp320 = getelementptr inbounds float, float addrspace(3)* %arg, i32 183
-  %tmp321 = load float, float addrspace(3)* %tmp320, align 4
+  %tmp316 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 181
+  %tmp317 = load float, ptr addrspace(3) %tmp316, align 4
+  %tmp318 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 182
+  %tmp319 = load float, ptr addrspace(3) %tmp318, align 4
+  %tmp320 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 183
+  %tmp321 = load float, ptr addrspace(3) %tmp320, align 4
   %tmp322 = tail call float @llvm.fmuladd.f32(float %tmp317, float %tmp319, float %tmp321)
-  %tmp323 = getelementptr inbounds float, float addrspace(3)* %arg, i32 185
-  %tmp324 = load float, float addrspace(3)* %tmp323, align 4
-  %tmp325 = getelementptr inbounds float, float addrspace(3)* %arg, i32 186
-  %tmp326 = load float, float addrspace(3)* %tmp325, align 4
-  %tmp327 = getelementptr inbounds float, float addrspace(3)* %arg, i32 187
-  %tmp328 = load float, float addrspace(3)* %tmp327, align 4
+  %tmp323 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 185
+  %tmp324 = load float, ptr addrspace(3) %tmp323, align 4
+  %tmp325 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 186
+  %tmp326 = load float, ptr addrspace(3) %tmp325, align 4
+  %tmp327 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 187
+  %tmp328 = load float, ptr addrspace(3) %tmp327, align 4
   %tmp329 = tail call float @llvm.fmuladd.f32(float %tmp324, float %tmp326, float %tmp328)
-  %tmp330 = getelementptr inbounds float, float addrspace(3)* %arg, i32 189
-  %tmp331 = load float, float addrspace(3)* %tmp330, align 4
-  %tmp332 = getelementptr inbounds float, float addrspace(3)* %arg, i32 190
-  %tmp333 = load float, float addrspace(3)* %tmp332, align 4
-  %tmp334 = getelementptr inbounds float, float addrspace(3)* %arg, i32 191
-  %tmp335 = load float, float addrspace(3)* %tmp334, align 4
+  %tmp330 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 189
+  %tmp331 = load float, ptr addrspace(3) %tmp330, align 4
+  %tmp332 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 190
+  %tmp333 = load float, ptr addrspace(3) %tmp332, align 4
+  %tmp334 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 191
+  %tmp335 = load float, ptr addrspace(3) %tmp334, align 4
   %tmp336 = tail call float @llvm.fmuladd.f32(float %tmp331, float %tmp333, float %tmp335)
-  %tmp337 = getelementptr inbounds float, float addrspace(3)* %arg, i32 193
-  %tmp338 = load float, float addrspace(3)* %tmp337, align 4
-  %tmp339 = getelementptr inbounds float, float addrspace(3)* %arg, i32 194
-  %tmp340 = load float, float addrspace(3)* %tmp339, align 4
-  %tmp341 = getelementptr inbounds float, float addrspace(3)* %arg, i32 195
-  %tmp342 = load float, float addrspace(3)* %tmp341, align 4
+  %tmp337 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 193
+  %tmp338 = load float, ptr addrspace(3) %tmp337, align 4
+  %tmp339 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 194
+  %tmp340 = load float, ptr addrspace(3) %tmp339, align 4
+  %tmp341 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 195
+  %tmp342 = load float, ptr addrspace(3) %tmp341, align 4
   %tmp343 = tail call float @llvm.fmuladd.f32(float %tmp338, float %tmp340, float %tmp342)
-  %tmp344 = getelementptr inbounds float, float addrspace(3)* %arg, i32 197
-  %tmp345 = load float, float addrspace(3)* %tmp344, align 4
-  %tmp346 = getelementptr inbounds float, float addrspace(3)* %arg, i32 198
-  %tmp347 = load float, float addrspace(3)* %tmp346, align 4
-  %tmp348 = getelementptr inbounds float, float addrspace(3)* %arg, i32 199
-  %tmp349 = load float, float addrspace(3)* %tmp348, align 4
+  %tmp344 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 197
+  %tmp345 = load float, ptr addrspace(3) %tmp344, align 4
+  %tmp346 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 198
+  %tmp347 = load float, ptr addrspace(3) %tmp346, align 4
+  %tmp348 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 199
+  %tmp349 = load float, ptr addrspace(3) %tmp348, align 4
   %tmp350 = tail call float @llvm.fmuladd.f32(float %tmp345, float %tmp347, float %tmp349)
-  %tmp351 = getelementptr inbounds float, float addrspace(3)* %arg, i32 201
-  %tmp352 = load float, float addrspace(3)* %tmp351, align 4
-  %tmp353 = getelementptr inbounds float, float addrspace(3)* %arg, i32 202
-  %tmp354 = load float, float addrspace(3)* %tmp353, align 4
-  %tmp355 = getelementptr inbounds float, float addrspace(3)* %arg, i32 203
-  %tmp356 = load float, float addrspace(3)* %tmp355, align 4
+  %tmp351 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 201
+  %tmp352 = load float, ptr addrspace(3) %tmp351, align 4
+  %tmp353 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 202
+  %tmp354 = load float, ptr addrspace(3) %tmp353, align 4
+  %tmp355 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 203
+  %tmp356 = load float, ptr addrspace(3) %tmp355, align 4
   %tmp357 = tail call float @llvm.fmuladd.f32(float %tmp352, float %tmp354, float %tmp356)
-  %tmp358 = getelementptr inbounds float, float addrspace(3)* %arg, i32 205
-  %tmp359 = load float, float addrspace(3)* %tmp358, align 4
-  %tmp360 = getelementptr inbounds float, float addrspace(3)* %arg, i32 206
-  %tmp361 = load float, float addrspace(3)* %tmp360, align 4
-  %tmp362 = getelementptr inbounds float, float addrspace(3)* %arg, i32 207
-  %tmp363 = load float, float addrspace(3)* %tmp362, align 4
+  %tmp358 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 205
+  %tmp359 = load float, ptr addrspace(3) %tmp358, align 4
+  %tmp360 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 206
+  %tmp361 = load float, ptr addrspace(3) %tmp360, align 4
+  %tmp362 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 207
+  %tmp363 = load float, ptr addrspace(3) %tmp362, align 4
   %tmp364 = tail call float @llvm.fmuladd.f32(float %tmp359, float %tmp361, float %tmp363)
-  %tmp365 = getelementptr inbounds float, float addrspace(3)* %arg, i32 209
-  %tmp366 = load float, float addrspace(3)* %tmp365, align 4
-  %tmp367 = getelementptr inbounds float, float addrspace(3)* %arg, i32 210
-  %tmp368 = load float, float addrspace(3)* %tmp367, align 4
-  %tmp369 = getelementptr inbounds float, float addrspace(3)* %arg, i32 211
-  %tmp370 = load float, float addrspace(3)* %tmp369, align 4
+  %tmp365 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 209
+  %tmp366 = load float, ptr addrspace(3) %tmp365, align 4
+  %tmp367 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 210
+  %tmp368 = load float, ptr addrspace(3) %tmp367, align 4
+  %tmp369 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 211
+  %tmp370 = load float, ptr addrspace(3) %tmp369, align 4
   %tmp371 = tail call float @llvm.fmuladd.f32(float %tmp366, float %tmp368, float %tmp370)
-  %tmp372 = getelementptr inbounds float, float addrspace(3)* %arg, i32 213
-  %tmp373 = load float, float addrspace(3)* %tmp372, align 4
-  %tmp374 = getelementptr inbounds float, float addrspace(3)* %arg, i32 214
-  %tmp375 = load float, float addrspace(3)* %tmp374, align 4
-  %tmp376 = getelementptr inbounds float, float addrspace(3)* %arg, i32 215
-  %tmp377 = load float, float addrspace(3)* %tmp376, align 4
+  %tmp372 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 213
+  %tmp373 = load float, ptr addrspace(3) %tmp372, align 4
+  %tmp374 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 214
+  %tmp375 = load float, ptr addrspace(3) %tmp374, align 4
+  %tmp376 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 215
+  %tmp377 = load float, ptr addrspace(3) %tmp376, align 4
   %tmp378 = tail call float @llvm.fmuladd.f32(float %tmp373, float %tmp375, float %tmp377)
-  %tmp379 = getelementptr inbounds float, float addrspace(3)* %arg, i32 217
-  %tmp380 = load float, float addrspace(3)* %tmp379, align 4
-  %tmp381 = getelementptr inbounds float, float addrspace(3)* %arg, i32 218
-  %tmp382 = load float, float addrspace(3)* %tmp381, align 4
-  %tmp383 = getelementptr inbounds float, float addrspace(3)* %arg, i32 219
-  %tmp384 = load float, float addrspace(3)* %tmp383, align 4
+  %tmp379 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 217
+  %tmp380 = load float, ptr addrspace(3) %tmp379, align 4
+  %tmp381 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 218
+  %tmp382 = load float, ptr addrspace(3) %tmp381, align 4
+  %tmp383 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 219
+  %tmp384 = load float, ptr addrspace(3) %tmp383, align 4
   %tmp385 = tail call float @llvm.fmuladd.f32(float %tmp380, float %tmp382, float %tmp384)
-  %tmp386 = getelementptr inbounds float, float addrspace(3)* %arg, i32 221
-  %tmp387 = load float, float addrspace(3)* %tmp386, align 4
-  %tmp388 = getelementptr inbounds float, float addrspace(3)* %arg, i32 222
-  %tmp389 = load float, float addrspace(3)* %tmp388, align 4
-  %tmp390 = getelementptr inbounds float, float addrspace(3)* %arg, i32 223
-  %tmp391 = load float, float addrspace(3)* %tmp390, align 4
+  %tmp386 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 221
+  %tmp387 = load float, ptr addrspace(3) %tmp386, align 4
+  %tmp388 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 222
+  %tmp389 = load float, ptr addrspace(3) %tmp388, align 4
+  %tmp390 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 223
+  %tmp391 = load float, ptr addrspace(3) %tmp390, align 4
   %tmp392 = tail call float @llvm.fmuladd.f32(float %tmp387, float %tmp389, float %tmp391)
-  %tmp393 = getelementptr inbounds float, float addrspace(3)* %arg, i32 225
-  %tmp394 = load float, float addrspace(3)* %tmp393, align 4
-  %tmp395 = getelementptr inbounds float, float addrspace(3)* %arg, i32 226
-  %tmp396 = load float, float addrspace(3)* %tmp395, align 4
-  %tmp397 = getelementptr inbounds float, float addrspace(3)* %arg, i32 227
-  %tmp398 = load float, float addrspace(3)* %tmp397, align 4
+  %tmp393 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 225
+  %tmp394 = load float, ptr addrspace(3) %tmp393, align 4
+  %tmp395 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 226
+  %tmp396 = load float, ptr addrspace(3) %tmp395, align 4
+  %tmp397 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 227
+  %tmp398 = load float, ptr addrspace(3) %tmp397, align 4
   %tmp399 = tail call float @llvm.fmuladd.f32(float %tmp394, float %tmp396, float %tmp398)
-  %tmp400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 229
-  %tmp401 = load float, float addrspace(3)* %tmp400, align 4
-  %tmp402 = getelementptr inbounds float, float addrspace(3)* %arg, i32 230
-  %tmp403 = load float, float addrspace(3)* %tmp402, align 4
-  %tmp404 = getelementptr inbounds float, float addrspace(3)* %arg, i32 231
-  %tmp405 = load float, float addrspace(3)* %tmp404, align 4
+  %tmp400 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 229
+  %tmp401 = load float, ptr addrspace(3) %tmp400, align 4
+  %tmp402 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 230
+  %tmp403 = load float, ptr addrspace(3) %tmp402, align 4
+  %tmp404 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 231
+  %tmp405 = load float, ptr addrspace(3) %tmp404, align 4
   %tmp406 = tail call float @llvm.fmuladd.f32(float %tmp401, float %tmp403, float %tmp405)
-  %tmp407 = getelementptr inbounds float, float addrspace(3)* %arg, i32 233
-  %tmp408 = load float, float addrspace(3)* %tmp407, align 4
-  %tmp409 = getelementptr inbounds float, float addrspace(3)* %arg, i32 234
-  %tmp410 = load float, float addrspace(3)* %tmp409, align 4
-  %tmp411 = getelementptr inbounds float, float addrspace(3)* %arg, i32 235
-  %tmp412 = load float, float addrspace(3)* %tmp411, align 4
+  %tmp407 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 233
+  %tmp408 = load float, ptr addrspace(3) %tmp407, align 4
+  %tmp409 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 234
+  %tmp410 = load float, ptr addrspace(3) %tmp409, align 4
+  %tmp411 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 235
+  %tmp412 = load float, ptr addrspace(3) %tmp411, align 4
   %tmp413 = tail call float @llvm.fmuladd.f32(float %tmp408, float %tmp410, float %tmp412)
-  %tmp414 = getelementptr inbounds float, float addrspace(3)* %arg, i32 237
-  %tmp415 = load float, float addrspace(3)* %tmp414, align 4
-  %tmp416 = getelementptr inbounds float, float addrspace(3)* %arg, i32 238
-  %tmp417 = load float, float addrspace(3)* %tmp416, align 4
-  %tmp418 = getelementptr inbounds float, float addrspace(3)* %arg, i32 239
-  %tmp419 = load float, float addrspace(3)* %tmp418, align 4
+  %tmp414 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 237
+  %tmp415 = load float, ptr addrspace(3) %tmp414, align 4
+  %tmp416 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 238
+  %tmp417 = load float, ptr addrspace(3) %tmp416, align 4
+  %tmp418 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 239
+  %tmp419 = load float, ptr addrspace(3) %tmp418, align 4
   %tmp420 = tail call float @llvm.fmuladd.f32(float %tmp415, float %tmp417, float %tmp419)
-  %tmp421 = getelementptr inbounds float, float addrspace(3)* %arg, i32 241
-  %tmp422 = load float, float addrspace(3)* %tmp421, align 4
-  %tmp423 = getelementptr inbounds float, float addrspace(3)* %arg, i32 242
-  %tmp424 = load float, float addrspace(3)* %tmp423, align 4
-  %tmp425 = getelementptr inbounds float, float addrspace(3)* %arg, i32 243
-  %tmp426 = load float, float addrspace(3)* %tmp425, align 4
+  %tmp421 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 241
+  %tmp422 = load float, ptr addrspace(3) %tmp421, align 4
+  %tmp423 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 242
+  %tmp424 = load float, ptr addrspace(3) %tmp423, align 4
+  %tmp425 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 243
+  %tmp426 = load float, ptr addrspace(3) %tmp425, align 4
   %tmp427 = tail call float @llvm.fmuladd.f32(float %tmp422, float %tmp424, float %tmp426)
-  %tmp428 = getelementptr inbounds float, float addrspace(3)* %arg, i32 245
-  %tmp429 = load float, float addrspace(3)* %tmp428, align 4
-  %tmp430 = getelementptr inbounds float, float addrspace(3)* %arg, i32 246
-  %tmp431 = load float, float addrspace(3)* %tmp430, align 4
-  %tmp432 = getelementptr inbounds float, float addrspace(3)* %arg, i32 247
-  %tmp433 = load float, float addrspace(3)* %tmp432, align 4
+  %tmp428 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 245
+  %tmp429 = load float, ptr addrspace(3) %tmp428, align 4
+  %tmp430 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 246
+  %tmp431 = load float, ptr addrspace(3) %tmp430, align 4
+  %tmp432 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 247
+  %tmp433 = load float, ptr addrspace(3) %tmp432, align 4
   %tmp434 = tail call float @llvm.fmuladd.f32(float %tmp429, float %tmp431, float %tmp433)
-  %tmp435 = getelementptr inbounds float, float addrspace(3)* %arg, i32 249
-  %tmp436 = load float, float addrspace(3)* %tmp435, align 4
-  %tmp437 = getelementptr inbounds float, float addrspace(3)* %arg, i32 250
-  %tmp438 = load float, float addrspace(3)* %tmp437, align 4
-  %tmp439 = getelementptr inbounds float, float addrspace(3)* %arg, i32 251
-  %tmp440 = load float, float addrspace(3)* %tmp439, align 4
+  %tmp435 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 249
+  %tmp436 = load float, ptr addrspace(3) %tmp435, align 4
+  %tmp437 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 250
+  %tmp438 = load float, ptr addrspace(3) %tmp437, align 4
+  %tmp439 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 251
+  %tmp440 = load float, ptr addrspace(3) %tmp439, align 4
   %tmp441 = tail call float @llvm.fmuladd.f32(float %tmp436, float %tmp438, float %tmp440)
-  %tmp442 = getelementptr inbounds float, float addrspace(3)* %arg, i32 253
-  %tmp443 = load float, float addrspace(3)* %tmp442, align 4
-  %tmp444 = getelementptr inbounds float, float addrspace(3)* %arg, i32 254
-  %tmp445 = load float, float addrspace(3)* %tmp444, align 4
-  %tmp446 = getelementptr inbounds float, float addrspace(3)* %arg, i32 255
-  %tmp447 = load float, float addrspace(3)* %tmp446, align 4
+  %tmp442 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 253
+  %tmp443 = load float, ptr addrspace(3) %tmp442, align 4
+  %tmp444 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 254
+  %tmp445 = load float, ptr addrspace(3) %tmp444, align 4
+  %tmp446 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 255
+  %tmp447 = load float, ptr addrspace(3) %tmp446, align 4
   %tmp448 = tail call float @llvm.fmuladd.f32(float %tmp443, float %tmp445, float %tmp447)
-  store float %tmp7, float addrspace(1)* %arg1, align 4
-  %tmp449 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 1
-  store float %tmp14, float addrspace(1)* %tmp449, align 4
-  %tmp450 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 2
-  store float %tmp21, float addrspace(1)* %tmp450, align 4
-  %tmp451 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 3
-  store float %tmp28, float addrspace(1)* %tmp451, align 4
-  %tmp452 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 4
-  store float %tmp35, float addrspace(1)* %tmp452, align 4
-  %tmp453 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 5
-  store float %tmp42, float addrspace(1)* %tmp453, align 4
-  %tmp454 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 6
-  store float %tmp49, float addrspace(1)* %tmp454, align 4
-  %tmp455 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 7
-  store float %tmp56, float addrspace(1)* %tmp455, align 4
-  %tmp456 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 8
-  store float %tmp63, float addrspace(1)* %tmp456, align 4
-  %tmp457 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 9
-  store float %tmp70, float addrspace(1)* %tmp457, align 4
-  %tmp458 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 10
-  store float %tmp77, float addrspace(1)* %tmp458, align 4
-  %tmp459 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 11
-  store float %tmp84, float addrspace(1)* %tmp459, align 4
-  %tmp460 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 12
-  store float %tmp91, float addrspace(1)* %tmp460, align 4
-  %tmp461 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 13
-  store float %tmp98, float addrspace(1)* %tmp461, align 4
-  %tmp462 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 14
-  store float %tmp105, float addrspace(1)* %tmp462, align 4
-  %tmp463 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 15
-  store float %tmp112, float addrspace(1)* %tmp463, align 4
-  %tmp464 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 16
-  store float %tmp119, float addrspace(1)* %tmp464, align 4
-  %tmp465 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 17
-  store float %tmp126, float addrspace(1)* %tmp465, align 4
-  %tmp466 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 18
-  store float %tmp133, float addrspace(1)* %tmp466, align 4
-  %tmp467 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 19
-  store float %tmp140, float addrspace(1)* %tmp467, align 4
-  %tmp468 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 20
-  store float %tmp147, float addrspace(1)* %tmp468, align 4
-  %tmp469 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 21
-  store float %tmp154, float addrspace(1)* %tmp469, align 4
-  %tmp470 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 22
-  store float %tmp161, float addrspace(1)* %tmp470, align 4
-  %tmp471 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 23
-  store float %tmp168, float addrspace(1)* %tmp471, align 4
-  %tmp472 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 24
-  store float %tmp175, float addrspace(1)* %tmp472, align 4
-  %tmp473 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 25
-  store float %tmp182, float addrspace(1)* %tmp473, align 4
-  %tmp474 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 26
-  store float %tmp189, float addrspace(1)* %tmp474, align 4
-  %tmp475 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 27
-  store float %tmp196, float addrspace(1)* %tmp475, align 4
-  %tmp476 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 28
-  store float %tmp203, float addrspace(1)* %tmp476, align 4
-  %tmp477 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 29
-  store float %tmp210, float addrspace(1)* %tmp477, align 4
-  %tmp478 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 30
-  store float %tmp217, float addrspace(1)* %tmp478, align 4
-  %tmp479 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 31
-  store float %tmp224, float addrspace(1)* %tmp479, align 4
-  %tmp480 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 32
-  store float %tmp231, float addrspace(1)* %tmp480, align 4
-  %tmp481 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 33
-  store float %tmp238, float addrspace(1)* %tmp481, align 4
-  %tmp482 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 34
-  store float %tmp245, float addrspace(1)* %tmp482, align 4
-  %tmp483 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 35
-  store float %tmp252, float addrspace(1)* %tmp483, align 4
-  %tmp484 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 36
-  store float %tmp259, float addrspace(1)* %tmp484, align 4
-  %tmp485 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 37
-  store float %tmp266, float addrspace(1)* %tmp485, align 4
-  %tmp486 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 38
-  store float %tmp273, float addrspace(1)* %tmp486, align 4
-  %tmp487 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 39
-  store float %tmp280, float addrspace(1)* %tmp487, align 4
-  %tmp488 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 40
-  store float %tmp287, float addrspace(1)* %tmp488, align 4
-  %tmp489 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 41
-  store float %tmp294, float addrspace(1)* %tmp489, align 4
-  %tmp490 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 42
-  store float %tmp301, float addrspace(1)* %tmp490, align 4
-  %tmp491 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 43
-  store float %tmp308, float addrspace(1)* %tmp491, align 4
-  %tmp492 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 44
-  store float %tmp315, float addrspace(1)* %tmp492, align 4
-  %tmp493 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 45
-  store float %tmp322, float addrspace(1)* %tmp493, align 4
-  %tmp494 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 46
-  store float %tmp329, float addrspace(1)* %tmp494, align 4
-  %tmp495 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 47
-  store float %tmp336, float addrspace(1)* %tmp495, align 4
-  %tmp496 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 48
-  store float %tmp343, float addrspace(1)* %tmp496, align 4
-  %tmp497 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 49
-  store float %tmp350, float addrspace(1)* %tmp497, align 4
-  %tmp498 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 50
-  store float %tmp357, float addrspace(1)* %tmp498, align 4
-  %tmp499 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 51
-  store float %tmp364, float addrspace(1)* %tmp499, align 4
-  %tmp500 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 52
-  store float %tmp371, float addrspace(1)* %tmp500, align 4
-  %tmp501 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 53
-  store float %tmp378, float addrspace(1)* %tmp501, align 4
-  %tmp502 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 54
-  store float %tmp385, float addrspace(1)* %tmp502, align 4
-  %tmp503 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 55
-  store float %tmp392, float addrspace(1)* %tmp503, align 4
-  %tmp504 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 56
-  store float %tmp399, float addrspace(1)* %tmp504, align 4
-  %tmp505 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 57
-  store float %tmp406, float addrspace(1)* %tmp505, align 4
-  %tmp506 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 58
-  store float %tmp413, float addrspace(1)* %tmp506, align 4
-  %tmp507 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 59
-  store float %tmp420, float addrspace(1)* %tmp507, align 4
-  %tmp508 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 60
-  store float %tmp427, float addrspace(1)* %tmp508, align 4
-  %tmp509 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 61
-  store float %tmp434, float addrspace(1)* %tmp509, align 4
-  %tmp510 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 62
-  store float %tmp441, float addrspace(1)* %tmp510, align 4
-  %tmp511 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 63
-  store float %tmp448, float addrspace(1)* %tmp511, align 4
+  store float %tmp7, ptr addrspace(1) %arg1, align 4
+  %tmp449 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 1
+  store float %tmp14, ptr addrspace(1) %tmp449, align 4
+  %tmp450 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 2
+  store float %tmp21, ptr addrspace(1) %tmp450, align 4
+  %tmp451 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 3
+  store float %tmp28, ptr addrspace(1) %tmp451, align 4
+  %tmp452 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 4
+  store float %tmp35, ptr addrspace(1) %tmp452, align 4
+  %tmp453 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 5
+  store float %tmp42, ptr addrspace(1) %tmp453, align 4
+  %tmp454 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 6
+  store float %tmp49, ptr addrspace(1) %tmp454, align 4
+  %tmp455 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 7
+  store float %tmp56, ptr addrspace(1) %tmp455, align 4
+  %tmp456 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 8
+  store float %tmp63, ptr addrspace(1) %tmp456, align 4
+  %tmp457 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 9
+  store float %tmp70, ptr addrspace(1) %tmp457, align 4
+  %tmp458 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 10
+  store float %tmp77, ptr addrspace(1) %tmp458, align 4
+  %tmp459 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 11
+  store float %tmp84, ptr addrspace(1) %tmp459, align 4
+  %tmp460 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 12
+  store float %tmp91, ptr addrspace(1) %tmp460, align 4
+  %tmp461 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 13
+  store float %tmp98, ptr addrspace(1) %tmp461, align 4
+  %tmp462 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 14
+  store float %tmp105, ptr addrspace(1) %tmp462, align 4
+  %tmp463 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 15
+  store float %tmp112, ptr addrspace(1) %tmp463, align 4
+  %tmp464 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 16
+  store float %tmp119, ptr addrspace(1) %tmp464, align 4
+  %tmp465 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 17
+  store float %tmp126, ptr addrspace(1) %tmp465, align 4
+  %tmp466 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 18
+  store float %tmp133, ptr addrspace(1) %tmp466, align 4
+  %tmp467 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 19
+  store float %tmp140, ptr addrspace(1) %tmp467, align 4
+  %tmp468 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 20
+  store float %tmp147, ptr addrspace(1) %tmp468, align 4
+  %tmp469 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 21
+  store float %tmp154, ptr addrspace(1) %tmp469, align 4
+  %tmp470 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 22
+  store float %tmp161, ptr addrspace(1) %tmp470, align 4
+  %tmp471 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 23
+  store float %tmp168, ptr addrspace(1) %tmp471, align 4
+  %tmp472 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 24
+  store float %tmp175, ptr addrspace(1) %tmp472, align 4
+  %tmp473 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 25
+  store float %tmp182, ptr addrspace(1) %tmp473, align 4
+  %tmp474 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 26
+  store float %tmp189, ptr addrspace(1) %tmp474, align 4
+  %tmp475 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 27
+  store float %tmp196, ptr addrspace(1) %tmp475, align 4
+  %tmp476 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 28
+  store float %tmp203, ptr addrspace(1) %tmp476, align 4
+  %tmp477 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 29
+  store float %tmp210, ptr addrspace(1) %tmp477, align 4
+  %tmp478 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 30
+  store float %tmp217, ptr addrspace(1) %tmp478, align 4
+  %tmp479 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 31
+  store float %tmp224, ptr addrspace(1) %tmp479, align 4
+  %tmp480 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 32
+  store float %tmp231, ptr addrspace(1) %tmp480, align 4
+  %tmp481 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 33
+  store float %tmp238, ptr addrspace(1) %tmp481, align 4
+  %tmp482 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 34
+  store float %tmp245, ptr addrspace(1) %tmp482, align 4
+  %tmp483 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 35
+  store float %tmp252, ptr addrspace(1) %tmp483, align 4
+  %tmp484 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 36
+  store float %tmp259, ptr addrspace(1) %tmp484, align 4
+  %tmp485 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 37
+  store float %tmp266, ptr addrspace(1) %tmp485, align 4
+  %tmp486 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 38
+  store float %tmp273, ptr addrspace(1) %tmp486, align 4
+  %tmp487 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 39
+  store float %tmp280, ptr addrspace(1) %tmp487, align 4
+  %tmp488 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 40
+  store float %tmp287, ptr addrspace(1) %tmp488, align 4
+  %tmp489 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 41
+  store float %tmp294, ptr addrspace(1) %tmp489, align 4
+  %tmp490 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 42
+  store float %tmp301, ptr addrspace(1) %tmp490, align 4
+  %tmp491 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 43
+  store float %tmp308, ptr addrspace(1) %tmp491, align 4
+  %tmp492 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 44
+  store float %tmp315, ptr addrspace(1) %tmp492, align 4
+  %tmp493 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 45
+  store float %tmp322, ptr addrspace(1) %tmp493, align 4
+  %tmp494 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 46
+  store float %tmp329, ptr addrspace(1) %tmp494, align 4
+  %tmp495 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 47
+  store float %tmp336, ptr addrspace(1) %tmp495, align 4
+  %tmp496 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 48
+  store float %tmp343, ptr addrspace(1) %tmp496, align 4
+  %tmp497 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 49
+  store float %tmp350, ptr addrspace(1) %tmp497, align 4
+  %tmp498 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 50
+  store float %tmp357, ptr addrspace(1) %tmp498, align 4
+  %tmp499 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 51
+  store float %tmp364, ptr addrspace(1) %tmp499, align 4
+  %tmp500 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 52
+  store float %tmp371, ptr addrspace(1) %tmp500, align 4
+  %tmp501 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 53
+  store float %tmp378, ptr addrspace(1) %tmp501, align 4
+  %tmp502 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 54
+  store float %tmp385, ptr addrspace(1) %tmp502, align 4
+  %tmp503 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 55
+  store float %tmp392, ptr addrspace(1) %tmp503, align 4
+  %tmp504 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 56
+  store float %tmp399, ptr addrspace(1) %tmp504, align 4
+  %tmp505 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 57
+  store float %tmp406, ptr addrspace(1) %tmp505, align 4
+  %tmp506 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 58
+  store float %tmp413, ptr addrspace(1) %tmp506, align 4
+  %tmp507 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 59
+  store float %tmp420, ptr addrspace(1) %tmp507, align 4
+  %tmp508 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 60
+  store float %tmp427, ptr addrspace(1) %tmp508, align 4
+  %tmp509 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 61
+  store float %tmp434, ptr addrspace(1) %tmp509, align 4
+  %tmp510 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 62
+  store float %tmp441, ptr addrspace(1) %tmp510, align 4
+  %tmp511 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 63
+  store float %tmp448, ptr addrspace(1) %tmp511, align 4
   ret void
 }
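
For reference, the rewrite applied throughout these tests is purely syntactic: every typed pointer type such as "float addrspace(3)*" becomes the opaque "ptr addrspace(3)", while the element type stays spelled out on the getelementptr/load/store instructions. A minimal before/after sketch (the @lds_load_* functions are hypothetical, not part of this patch):

  ; Before: typed pointers carry the pointee type.
  define void @lds_load_typed(float addrspace(3)* %p) {
    %gep = getelementptr inbounds float, float addrspace(3)* %p, i32 4
    %v = load float, float addrspace(3)* %gep, align 4
    ret void
  }

  ; After: opaque pointers drop the pointee type; the element type now
  ; appears only as the first operand of getelementptr and load.
  define void @lds_load_opaque(ptr addrspace(3) %p) {
    %gep = getelementptr inbounds float, ptr addrspace(3) %p, i32 4
    %v = load float, ptr addrspace(3) %gep, align 4
    ret void
  }

Because the address spaces and element types are preserved, generated code and CHECK lines are unaffected, which is why the change is NFC.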
 

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index 87b2d29c8b0fb..a568af00f05c1 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -13,188 +13,188 @@
 ; VI: NumSgprs: {{[0-9]$}}
 ; VI: NumVgprs: {{[1-3][0-9]$}}
 
-define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
+define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %in_arg, ptr addrspace(1) nocapture %out_arg) {
 bb:
-  %adr.a.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20004
-  %adr.b.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20252
-  %adr.c.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20508
-  %adr.a.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20772
-  %adr.b.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21020
-  %adr.c.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21276
-  %adr.a.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21540
-  %adr.b.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21788
-  %adr.c.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22044
-  %adr.a.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22308
-  %adr.b.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22556
-  %adr.c.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22812
-  %adr.a.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23076
-  %adr.b.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23324
-  %adr.c.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23580
-  %adr.a.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23844
-  %adr.b.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24092
-  %adr.c.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24348
-  %adr.a.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24612
-  %adr.b.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24860
-  %adr.c.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25116
-  %adr.a.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25380
-  %adr.b.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25628
-  %adr.c.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25884
-  %adr.a.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26148
-  %adr.b.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26396
-  %adr.c.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26652
-  %adr.a.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26916
-  %adr.b.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27164
-  %adr.c.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27420
-  %adr.a.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27684
-  %adr.b.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27932
-  %adr.c.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28188
-  %adr.a.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28452
-  %adr.b.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28700
-  %adr.c.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28956
-  %adr.a.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29220
-  %adr.b.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29468
-  %adr.c.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29724
-  %adr.a.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29988
-  %adr.b.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30236
-  %adr.c.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30492
-  %adr.a.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30756
-  %adr.b.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31004
-  %adr.c.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31260
-  %adr.a.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31524
-  %adr.b.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31772
-  %adr.c.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32028
-  %adr.a.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32292
-  %adr.b.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32540
-  %adr.c.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32796
-  %adr.a.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33060
-  %adr.b.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33308
-  %adr.c.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33564
-  %adr.a.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33828
-  %adr.b.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34076
-  %adr.c.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34332
-  %adr.a.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34596
-  %adr.b.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34844
-  %adr.c.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35100
-  %adr.a.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35364
-  %adr.b.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35612
-  %adr.c.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35868
-  %adr.a.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36132
-  %adr.b.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36380
-  %adr.c.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36636
-  %adr.a.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36900
-  %adr.b.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37148
-  %adr.c.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37404
-  %adr.a.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37668
-  %adr.b.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37916
-  %adr.c.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38172
-  %adr.a.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38436
-  %adr.b.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38684
-  %adr.c.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38940
-  %adr.a.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39204
-  %adr.b.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39452
-  %adr.c.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39708
-  %adr.a.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39972
-  %adr.b.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40220
-  %adr.c.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40476
-  %adr.a.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40740
-  %adr.b.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40988
-  %adr.c.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41244
-  %adr.a.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41508
-  %adr.b.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41756
-  %adr.c.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42012
-  %adr.a.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42276
-  %adr.b.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42524
-  %adr.c.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42780
-  %a.0 = load float, float addrspace(3)* %adr.a.0, align 4
-  %b.0 = load float, float addrspace(3)* %adr.b.0, align 4
-  %c.0 = load float, float addrspace(3)* %adr.c.0, align 4
-  %a.1 = load float, float addrspace(3)* %adr.a.1, align 4
-  %b.1 = load float, float addrspace(3)* %adr.b.1, align 4
-  %c.1 = load float, float addrspace(3)* %adr.c.1, align 4
-  %a.2 = load float, float addrspace(3)* %adr.a.2, align 4
-  %b.2 = load float, float addrspace(3)* %adr.b.2, align 4
-  %c.2 = load float, float addrspace(3)* %adr.c.2, align 4
-  %a.3 = load float, float addrspace(3)* %adr.a.3, align 4
-  %b.3 = load float, float addrspace(3)* %adr.b.3, align 4
-  %c.3 = load float, float addrspace(3)* %adr.c.3, align 4
-  %a.4 = load float, float addrspace(3)* %adr.a.4, align 4
-  %b.4 = load float, float addrspace(3)* %adr.b.4, align 4
-  %c.4 = load float, float addrspace(3)* %adr.c.4, align 4
-  %a.5 = load float, float addrspace(3)* %adr.a.5, align 4
-  %b.5 = load float, float addrspace(3)* %adr.b.5, align 4
-  %c.5 = load float, float addrspace(3)* %adr.c.5, align 4
-  %a.6 = load float, float addrspace(3)* %adr.a.6, align 4
-  %b.6 = load float, float addrspace(3)* %adr.b.6, align 4
-  %c.6 = load float, float addrspace(3)* %adr.c.6, align 4
-  %a.7 = load float, float addrspace(3)* %adr.a.7, align 4
-  %b.7 = load float, float addrspace(3)* %adr.b.7, align 4
-  %c.7 = load float, float addrspace(3)* %adr.c.7, align 4
-  %a.8 = load float, float addrspace(3)* %adr.a.8, align 4
-  %b.8 = load float, float addrspace(3)* %adr.b.8, align 4
-  %c.8 = load float, float addrspace(3)* %adr.c.8, align 4
-  %a.9 = load float, float addrspace(3)* %adr.a.9, align 4
-  %b.9 = load float, float addrspace(3)* %adr.b.9, align 4
-  %c.9 = load float, float addrspace(3)* %adr.c.9, align 4
-  %a.10 = load float, float addrspace(3)* %adr.a.10, align 4
-  %b.10 = load float, float addrspace(3)* %adr.b.10, align 4
-  %c.10 = load float, float addrspace(3)* %adr.c.10, align 4
-  %a.11 = load float, float addrspace(3)* %adr.a.11, align 4
-  %b.11 = load float, float addrspace(3)* %adr.b.11, align 4
-  %c.11 = load float, float addrspace(3)* %adr.c.11, align 4
-  %a.12 = load float, float addrspace(3)* %adr.a.12, align 4
-  %b.12 = load float, float addrspace(3)* %adr.b.12, align 4
-  %c.12 = load float, float addrspace(3)* %adr.c.12, align 4
-  %a.13 = load float, float addrspace(3)* %adr.a.13, align 4
-  %b.13 = load float, float addrspace(3)* %adr.b.13, align 4
-  %c.13 = load float, float addrspace(3)* %adr.c.13, align 4
-  %a.14 = load float, float addrspace(3)* %adr.a.14, align 4
-  %b.14 = load float, float addrspace(3)* %adr.b.14, align 4
-  %c.14 = load float, float addrspace(3)* %adr.c.14, align 4
-  %a.15 = load float, float addrspace(3)* %adr.a.15, align 4
-  %b.15 = load float, float addrspace(3)* %adr.b.15, align 4
-  %c.15 = load float, float addrspace(3)* %adr.c.15, align 4
-  %a.16 = load float, float addrspace(3)* %adr.a.16, align 4
-  %b.16 = load float, float addrspace(3)* %adr.b.16, align 4
-  %c.16 = load float, float addrspace(3)* %adr.c.16, align 4
-  %a.17 = load float, float addrspace(3)* %adr.a.17, align 4
-  %b.17 = load float, float addrspace(3)* %adr.b.17, align 4
-  %c.17 = load float, float addrspace(3)* %adr.c.17, align 4
-  %a.18 = load float, float addrspace(3)* %adr.a.18, align 4
-  %b.18 = load float, float addrspace(3)* %adr.b.18, align 4
-  %c.18 = load float, float addrspace(3)* %adr.c.18, align 4
-  %a.19 = load float, float addrspace(3)* %adr.a.19, align 4
-  %b.19 = load float, float addrspace(3)* %adr.b.19, align 4
-  %c.19 = load float, float addrspace(3)* %adr.c.19, align 4
-  %a.20 = load float, float addrspace(3)* %adr.a.20, align 4
-  %b.20 = load float, float addrspace(3)* %adr.b.20, align 4
-  %c.20 = load float, float addrspace(3)* %adr.c.20, align 4
-  %a.21 = load float, float addrspace(3)* %adr.a.21, align 4
-  %b.21 = load float, float addrspace(3)* %adr.b.21, align 4
-  %c.21 = load float, float addrspace(3)* %adr.c.21, align 4
-  %a.22 = load float, float addrspace(3)* %adr.a.22, align 4
-  %b.22 = load float, float addrspace(3)* %adr.b.22, align 4
-  %c.22 = load float, float addrspace(3)* %adr.c.22, align 4
-  %a.23 = load float, float addrspace(3)* %adr.a.23, align 4
-  %b.23 = load float, float addrspace(3)* %adr.b.23, align 4
-  %c.23 = load float, float addrspace(3)* %adr.c.23, align 4
-  %a.24 = load float, float addrspace(3)* %adr.a.24, align 4
-  %b.24 = load float, float addrspace(3)* %adr.b.24, align 4
-  %c.24 = load float, float addrspace(3)* %adr.c.24, align 4
-  %a.25 = load float, float addrspace(3)* %adr.a.25, align 4
-  %b.25 = load float, float addrspace(3)* %adr.b.25, align 4
-  %c.25 = load float, float addrspace(3)* %adr.c.25, align 4
-  %a.26 = load float, float addrspace(3)* %adr.a.26, align 4
-  %b.26 = load float, float addrspace(3)* %adr.b.26, align 4
-  %c.26 = load float, float addrspace(3)* %adr.c.26, align 4
-  %a.27 = load float, float addrspace(3)* %adr.a.27, align 4
-  %b.27 = load float, float addrspace(3)* %adr.b.27, align 4
-  %c.27 = load float, float addrspace(3)* %adr.c.27, align 4
-  %a.28 = load float, float addrspace(3)* %adr.a.28, align 4
-  %b.28 = load float, float addrspace(3)* %adr.b.28, align 4
-  %c.28 = load float, float addrspace(3)* %adr.c.28, align 4
-  %a.29 = load float, float addrspace(3)* %adr.a.29, align 4
-  %b.29 = load float, float addrspace(3)* %adr.b.29, align 4
-  %c.29 = load float, float addrspace(3)* %adr.c.29, align 4
+  %adr.a.0 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 20004
+  %adr.b.0 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 20252
+  %adr.c.0 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 20508
+  %adr.a.1 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 20772
+  %adr.b.1 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 21020
+  %adr.c.1 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 21276
+  %adr.a.2 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 21540
+  %adr.b.2 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 21788
+  %adr.c.2 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 22044
+  %adr.a.3 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 22308
+  %adr.b.3 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 22556
+  %adr.c.3 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 22812
+  %adr.a.4 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 23076
+  %adr.b.4 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 23324
+  %adr.c.4 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 23580
+  %adr.a.5 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 23844
+  %adr.b.5 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 24092
+  %adr.c.5 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 24348
+  %adr.a.6 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 24612
+  %adr.b.6 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 24860
+  %adr.c.6 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 25116
+  %adr.a.7 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 25380
+  %adr.b.7 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 25628
+  %adr.c.7 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 25884
+  %adr.a.8 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 26148
+  %adr.b.8 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 26396
+  %adr.c.8 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 26652
+  %adr.a.9 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 26916
+  %adr.b.9 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 27164
+  %adr.c.9 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 27420
+  %adr.a.10 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 27684
+  %adr.b.10 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 27932
+  %adr.c.10 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 28188
+  %adr.a.11 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 28452
+  %adr.b.11 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 28700
+  %adr.c.11 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 28956
+  %adr.a.12 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 29220
+  %adr.b.12 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 29468
+  %adr.c.12 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 29724
+  %adr.a.13 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 29988
+  %adr.b.13 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 30236
+  %adr.c.13 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 30492
+  %adr.a.14 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 30756
+  %adr.b.14 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 31004
+  %adr.c.14 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 31260
+  %adr.a.15 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 31524
+  %adr.b.15 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 31772
+  %adr.c.15 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 32028
+  %adr.a.16 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 32292
+  %adr.b.16 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 32540
+  %adr.c.16 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 32796
+  %adr.a.17 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 33060
+  %adr.b.17 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 33308
+  %adr.c.17 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 33564
+  %adr.a.18 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 33828
+  %adr.b.18 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 34076
+  %adr.c.18 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 34332
+  %adr.a.19 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 34596
+  %adr.b.19 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 34844
+  %adr.c.19 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 35100
+  %adr.a.20 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 35364
+  %adr.b.20 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 35612
+  %adr.c.20 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 35868
+  %adr.a.21 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 36132
+  %adr.b.21 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 36380
+  %adr.c.21 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 36636
+  %adr.a.22 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 36900
+  %adr.b.22 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 37148
+  %adr.c.22 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 37404
+  %adr.a.23 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 37668
+  %adr.b.23 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 37916
+  %adr.c.23 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 38172
+  %adr.a.24 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 38436
+  %adr.b.24 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 38684
+  %adr.c.24 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 38940
+  %adr.a.25 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 39204
+  %adr.b.25 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 39452
+  %adr.c.25 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 39708
+  %adr.a.26 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 39972
+  %adr.b.26 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 40220
+  %adr.c.26 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 40476
+  %adr.a.27 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 40740
+  %adr.b.27 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 40988
+  %adr.c.27 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 41244
+  %adr.a.28 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 41508
+  %adr.b.28 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 41756
+  %adr.c.28 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 42012
+  %adr.a.29 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 42276
+  %adr.b.29 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 42524
+  %adr.c.29 = getelementptr inbounds float, ptr addrspace(3) %in_arg, i32 42780
+  %a.0 = load float, ptr addrspace(3) %adr.a.0, align 4
+  %b.0 = load float, ptr addrspace(3) %adr.b.0, align 4
+  %c.0 = load float, ptr addrspace(3) %adr.c.0, align 4
+  %a.1 = load float, ptr addrspace(3) %adr.a.1, align 4
+  %b.1 = load float, ptr addrspace(3) %adr.b.1, align 4
+  %c.1 = load float, ptr addrspace(3) %adr.c.1, align 4
+  %a.2 = load float, ptr addrspace(3) %adr.a.2, align 4
+  %b.2 = load float, ptr addrspace(3) %adr.b.2, align 4
+  %c.2 = load float, ptr addrspace(3) %adr.c.2, align 4
+  %a.3 = load float, ptr addrspace(3) %adr.a.3, align 4
+  %b.3 = load float, ptr addrspace(3) %adr.b.3, align 4
+  %c.3 = load float, ptr addrspace(3) %adr.c.3, align 4
+  %a.4 = load float, ptr addrspace(3) %adr.a.4, align 4
+  %b.4 = load float, ptr addrspace(3) %adr.b.4, align 4
+  %c.4 = load float, ptr addrspace(3) %adr.c.4, align 4
+  %a.5 = load float, ptr addrspace(3) %adr.a.5, align 4
+  %b.5 = load float, ptr addrspace(3) %adr.b.5, align 4
+  %c.5 = load float, ptr addrspace(3) %adr.c.5, align 4
+  %a.6 = load float, ptr addrspace(3) %adr.a.6, align 4
+  %b.6 = load float, ptr addrspace(3) %adr.b.6, align 4
+  %c.6 = load float, ptr addrspace(3) %adr.c.6, align 4
+  %a.7 = load float, ptr addrspace(3) %adr.a.7, align 4
+  %b.7 = load float, ptr addrspace(3) %adr.b.7, align 4
+  %c.7 = load float, ptr addrspace(3) %adr.c.7, align 4
+  %a.8 = load float, ptr addrspace(3) %adr.a.8, align 4
+  %b.8 = load float, ptr addrspace(3) %adr.b.8, align 4
+  %c.8 = load float, ptr addrspace(3) %adr.c.8, align 4
+  %a.9 = load float, ptr addrspace(3) %adr.a.9, align 4
+  %b.9 = load float, ptr addrspace(3) %adr.b.9, align 4
+  %c.9 = load float, ptr addrspace(3) %adr.c.9, align 4
+  %a.10 = load float, ptr addrspace(3) %adr.a.10, align 4
+  %b.10 = load float, ptr addrspace(3) %adr.b.10, align 4
+  %c.10 = load float, ptr addrspace(3) %adr.c.10, align 4
+  %a.11 = load float, ptr addrspace(3) %adr.a.11, align 4
+  %b.11 = load float, ptr addrspace(3) %adr.b.11, align 4
+  %c.11 = load float, ptr addrspace(3) %adr.c.11, align 4
+  %a.12 = load float, ptr addrspace(3) %adr.a.12, align 4
+  %b.12 = load float, ptr addrspace(3) %adr.b.12, align 4
+  %c.12 = load float, ptr addrspace(3) %adr.c.12, align 4
+  %a.13 = load float, ptr addrspace(3) %adr.a.13, align 4
+  %b.13 = load float, ptr addrspace(3) %adr.b.13, align 4
+  %c.13 = load float, ptr addrspace(3) %adr.c.13, align 4
+  %a.14 = load float, ptr addrspace(3) %adr.a.14, align 4
+  %b.14 = load float, ptr addrspace(3) %adr.b.14, align 4
+  %c.14 = load float, ptr addrspace(3) %adr.c.14, align 4
+  %a.15 = load float, ptr addrspace(3) %adr.a.15, align 4
+  %b.15 = load float, ptr addrspace(3) %adr.b.15, align 4
+  %c.15 = load float, ptr addrspace(3) %adr.c.15, align 4
+  %a.16 = load float, ptr addrspace(3) %adr.a.16, align 4
+  %b.16 = load float, ptr addrspace(3) %adr.b.16, align 4
+  %c.16 = load float, ptr addrspace(3) %adr.c.16, align 4
+  %a.17 = load float, ptr addrspace(3) %adr.a.17, align 4
+  %b.17 = load float, ptr addrspace(3) %adr.b.17, align 4
+  %c.17 = load float, ptr addrspace(3) %adr.c.17, align 4
+  %a.18 = load float, ptr addrspace(3) %adr.a.18, align 4
+  %b.18 = load float, ptr addrspace(3) %adr.b.18, align 4
+  %c.18 = load float, ptr addrspace(3) %adr.c.18, align 4
+  %a.19 = load float, ptr addrspace(3) %adr.a.19, align 4
+  %b.19 = load float, ptr addrspace(3) %adr.b.19, align 4
+  %c.19 = load float, ptr addrspace(3) %adr.c.19, align 4
+  %a.20 = load float, ptr addrspace(3) %adr.a.20, align 4
+  %b.20 = load float, ptr addrspace(3) %adr.b.20, align 4
+  %c.20 = load float, ptr addrspace(3) %adr.c.20, align 4
+  %a.21 = load float, ptr addrspace(3) %adr.a.21, align 4
+  %b.21 = load float, ptr addrspace(3) %adr.b.21, align 4
+  %c.21 = load float, ptr addrspace(3) %adr.c.21, align 4
+  %a.22 = load float, ptr addrspace(3) %adr.a.22, align 4
+  %b.22 = load float, ptr addrspace(3) %adr.b.22, align 4
+  %c.22 = load float, ptr addrspace(3) %adr.c.22, align 4
+  %a.23 = load float, ptr addrspace(3) %adr.a.23, align 4
+  %b.23 = load float, ptr addrspace(3) %adr.b.23, align 4
+  %c.23 = load float, ptr addrspace(3) %adr.c.23, align 4
+  %a.24 = load float, ptr addrspace(3) %adr.a.24, align 4
+  %b.24 = load float, ptr addrspace(3) %adr.b.24, align 4
+  %c.24 = load float, ptr addrspace(3) %adr.c.24, align 4
+  %a.25 = load float, ptr addrspace(3) %adr.a.25, align 4
+  %b.25 = load float, ptr addrspace(3) %adr.b.25, align 4
+  %c.25 = load float, ptr addrspace(3) %adr.c.25, align 4
+  %a.26 = load float, ptr addrspace(3) %adr.a.26, align 4
+  %b.26 = load float, ptr addrspace(3) %adr.b.26, align 4
+  %c.26 = load float, ptr addrspace(3) %adr.c.26, align 4
+  %a.27 = load float, ptr addrspace(3) %adr.a.27, align 4
+  %b.27 = load float, ptr addrspace(3) %adr.b.27, align 4
+  %c.27 = load float, ptr addrspace(3) %adr.c.27, align 4
+  %a.28 = load float, ptr addrspace(3) %adr.a.28, align 4
+  %b.28 = load float, ptr addrspace(3) %adr.b.28, align 4
+  %c.28 = load float, ptr addrspace(3) %adr.c.28, align 4
+  %a.29 = load float, ptr addrspace(3) %adr.a.29, align 4
+  %b.29 = load float, ptr addrspace(3) %adr.b.29, align 4
+  %c.29 = load float, ptr addrspace(3) %adr.c.29, align 4
   %res.0 = tail call float @llvm.fmuladd.f32(float %a.0, float %b.0, float %c.0)
   %res.1 = tail call float @llvm.fmuladd.f32(float %a.1, float %b.1, float %c.1)
   %res.2 = tail call float @llvm.fmuladd.f32(float %a.2, float %b.2, float %c.2)
@@ -225,66 +225,65 @@ bb:
   %res.27 = tail call float @llvm.fmuladd.f32(float %a.27, float %b.27, float %c.27)
   %res.28 = tail call float @llvm.fmuladd.f32(float %a.28, float %b.28, float %c.28)
   %res.29 = tail call float @llvm.fmuladd.f32(float %a.29, float %b.29, float %c.29)
-  %adr.res.0 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 0
-  %adr.res.1 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 2
-  %adr.res.2 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 4
-  %adr.res.3 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 6
-  %adr.res.4 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 8
-  %adr.res.5 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 10
-  %adr.res.6 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 12
-  %adr.res.7 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 14
-  %adr.res.8 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 16
-  %adr.res.9 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 18
-  %adr.res.10 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 20
-  %adr.res.11 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 22
-  %adr.res.12 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 24
-  %adr.res.13 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 26
-  %adr.res.14 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 28
-  %adr.res.15 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 30
-  %adr.res.16 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 32
-  %adr.res.17 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 34
-  %adr.res.18 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 36
-  %adr.res.19 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 38
-  %adr.res.20 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 40
-  %adr.res.21 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 42
-  %adr.res.22 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 44
-  %adr.res.23 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 46
-  %adr.res.24 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 48
-  %adr.res.25 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 50
-  %adr.res.26 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 52
-  %adr.res.27 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 54
-  %adr.res.28 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 56
-  %adr.res.29 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 58
-  store float %res.0, float addrspace(1)* %adr.res.0, align 4
-  store float %res.1, float addrspace(1)* %adr.res.1, align 4
-  store float %res.2, float addrspace(1)* %adr.res.2, align 4
-  store float %res.3, float addrspace(1)* %adr.res.3, align 4
-  store float %res.4, float addrspace(1)* %adr.res.4, align 4
-  store float %res.5, float addrspace(1)* %adr.res.5, align 4
-  store float %res.6, float addrspace(1)* %adr.res.6, align 4
-  store float %res.7, float addrspace(1)* %adr.res.7, align 4
-  store float %res.8, float addrspace(1)* %adr.res.8, align 4
-  store float %res.9, float addrspace(1)* %adr.res.9, align 4
-  store float %res.10, float addrspace(1)* %adr.res.10, align 4
-  store float %res.11, float addrspace(1)* %adr.res.11, align 4
-  store float %res.12, float addrspace(1)* %adr.res.12, align 4
-  store float %res.13, float addrspace(1)* %adr.res.13, align 4
-  store float %res.14, float addrspace(1)* %adr.res.14, align 4
-  store float %res.15, float addrspace(1)* %adr.res.15, align 4
-  store float %res.16, float addrspace(1)* %adr.res.16, align 4
-  store float %res.17, float addrspace(1)* %adr.res.17, align 4
-  store float %res.18, float addrspace(1)* %adr.res.18, align 4
-  store float %res.19, float addrspace(1)* %adr.res.19, align 4
-  store float %res.20, float addrspace(1)* %adr.res.20, align 4
-  store float %res.21, float addrspace(1)* %adr.res.21, align 4
-  store float %res.22, float addrspace(1)* %adr.res.22, align 4
-  store float %res.23, float addrspace(1)* %adr.res.23, align 4
-  store float %res.24, float addrspace(1)* %adr.res.24, align 4
-  store float %res.25, float addrspace(1)* %adr.res.25, align 4
-  store float %res.26, float addrspace(1)* %adr.res.26, align 4
-  store float %res.27, float addrspace(1)* %adr.res.27, align 4
-  store float %res.28, float addrspace(1)* %adr.res.28, align 4
-  store float %res.29, float addrspace(1)* %adr.res.29, align 4
+  %adr.res.1 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 2
+  %adr.res.2 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 4
+  %adr.res.3 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 6
+  %adr.res.4 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 8
+  %adr.res.5 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 10
+  %adr.res.6 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 12
+  %adr.res.7 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 14
+  %adr.res.8 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 16
+  %adr.res.9 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 18
+  %adr.res.10 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 20
+  %adr.res.11 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 22
+  %adr.res.12 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 24
+  %adr.res.13 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 26
+  %adr.res.14 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 28
+  %adr.res.15 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 30
+  %adr.res.16 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 32
+  %adr.res.17 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 34
+  %adr.res.18 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 36
+  %adr.res.19 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 38
+  %adr.res.20 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 40
+  %adr.res.21 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 42
+  %adr.res.22 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 44
+  %adr.res.23 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 46
+  %adr.res.24 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 48
+  %adr.res.25 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 50
+  %adr.res.26 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 52
+  %adr.res.27 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 54
+  %adr.res.28 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 56
+  %adr.res.29 = getelementptr inbounds float, ptr addrspace(1) %out_arg, i64 58
+  store float %res.0, ptr addrspace(1) %out_arg, align 4
+  store float %res.1, ptr addrspace(1) %adr.res.1, align 4
+  store float %res.2, ptr addrspace(1) %adr.res.2, align 4
+  store float %res.3, ptr addrspace(1) %adr.res.3, align 4
+  store float %res.4, ptr addrspace(1) %adr.res.4, align 4
+  store float %res.5, ptr addrspace(1) %adr.res.5, align 4
+  store float %res.6, ptr addrspace(1) %adr.res.6, align 4
+  store float %res.7, ptr addrspace(1) %adr.res.7, align 4
+  store float %res.8, ptr addrspace(1) %adr.res.8, align 4
+  store float %res.9, ptr addrspace(1) %adr.res.9, align 4
+  store float %res.10, ptr addrspace(1) %adr.res.10, align 4
+  store float %res.11, ptr addrspace(1) %adr.res.11, align 4
+  store float %res.12, ptr addrspace(1) %adr.res.12, align 4
+  store float %res.13, ptr addrspace(1) %adr.res.13, align 4
+  store float %res.14, ptr addrspace(1) %adr.res.14, align 4
+  store float %res.15, ptr addrspace(1) %adr.res.15, align 4
+  store float %res.16, ptr addrspace(1) %adr.res.16, align 4
+  store float %res.17, ptr addrspace(1) %adr.res.17, align 4
+  store float %res.18, ptr addrspace(1) %adr.res.18, align 4
+  store float %res.19, ptr addrspace(1) %adr.res.19, align 4
+  store float %res.20, ptr addrspace(1) %adr.res.20, align 4
+  store float %res.21, ptr addrspace(1) %adr.res.21, align 4
+  store float %res.22, ptr addrspace(1) %adr.res.22, align 4
+  store float %res.23, ptr addrspace(1) %adr.res.23, align 4
+  store float %res.24, ptr addrspace(1) %adr.res.24, align 4
+  store float %res.25, ptr addrspace(1) %adr.res.25, align 4
+  store float %res.26, ptr addrspace(1) %adr.res.26, align 4
+  store float %res.27, ptr addrspace(1) %adr.res.27, align 4
+  store float %res.28, ptr addrspace(1) %adr.res.28, align 4
+  store float %res.29, ptr addrspace(1) %adr.res.29, align 4
   ret void
 }
 declare float @llvm.fmuladd.f32(float, float, float) #0

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
index d567e1535f64c..bc47838a32aa3 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
@@ -13,583 +13,583 @@
 ;
 ; MISCHED: NumVgprs: {{[7-9][0-9]$}}
 
-define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) #1 {
+define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) #1 {
 bb:
-  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
-  %tmp2 = load float, float addrspace(3)* %tmp, align 4
-  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
-  %tmp4 = load float, float addrspace(3)* %tmp3, align 4
-  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 3
-  %tmp6 = load float, float addrspace(3)* %tmp5, align 4
+  %tmp = getelementptr inbounds float, ptr addrspace(3) %arg, i32 1
+  %tmp2 = load float, ptr addrspace(3) %tmp, align 4
+  %tmp3 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 2
+  %tmp4 = load float, ptr addrspace(3) %tmp3, align 4
+  %tmp5 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 3
+  %tmp6 = load float, ptr addrspace(3) %tmp5, align 4
   %tmp7 = tail call float @llvm.fmuladd.f32(float %tmp2, float %tmp4, float %tmp6)
-  %tmp8 = getelementptr inbounds float, float addrspace(3)* %arg, i32 5
-  %tmp9 = load float, float addrspace(3)* %tmp8, align 4
-  %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6
-  %tmp11 = load float, float addrspace(3)* %tmp10, align 4
-  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 7
-  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
+  %tmp8 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 5
+  %tmp9 = load float, ptr addrspace(3) %tmp8, align 4
+  %tmp10 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 6
+  %tmp11 = load float, ptr addrspace(3) %tmp10, align 4
+  %tmp12 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 7
+  %tmp13 = load float, ptr addrspace(3) %tmp12, align 4
   %tmp14 = tail call float @llvm.fmuladd.f32(float %tmp9, float %tmp11, float %tmp13)
-  %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 9
-  %tmp16 = load float, float addrspace(3)* %tmp15, align 4
-  %tmp17 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10
-  %tmp18 = load float, float addrspace(3)* %tmp17, align 4
-  %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 11
-  %tmp20 = load float, float addrspace(3)* %tmp19, align 4
+  %tmp15 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 9
+  %tmp16 = load float, ptr addrspace(3) %tmp15, align 4
+  %tmp17 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 10
+  %tmp18 = load float, ptr addrspace(3) %tmp17, align 4
+  %tmp19 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 11
+  %tmp20 = load float, ptr addrspace(3) %tmp19, align 4
   %tmp21 = tail call float @llvm.fmuladd.f32(float %tmp16, float %tmp18, float %tmp20)
-  %tmp22 = getelementptr inbounds float, float addrspace(3)* %arg, i32 13
-  %tmp23 = load float, float addrspace(3)* %tmp22, align 4
-  %tmp24 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14
-  %tmp25 = load float, float addrspace(3)* %tmp24, align 4
-  %tmp26 = getelementptr inbounds float, float addrspace(3)* %arg, i32 15
-  %tmp27 = load float, float addrspace(3)* %tmp26, align 4
+  %tmp22 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 13
+  %tmp23 = load float, ptr addrspace(3) %tmp22, align 4
+  %tmp24 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 14
+  %tmp25 = load float, ptr addrspace(3) %tmp24, align 4
+  %tmp26 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 15
+  %tmp27 = load float, ptr addrspace(3) %tmp26, align 4
   %tmp28 = tail call float @llvm.fmuladd.f32(float %tmp23, float %tmp25, float %tmp27)
-  %tmp29 = getelementptr inbounds float, float addrspace(3)* %arg, i32 17
-  %tmp30 = load float, float addrspace(3)* %tmp29, align 4
-  %tmp31 = getelementptr inbounds float, float addrspace(3)* %arg, i32 18
-  %tmp32 = load float, float addrspace(3)* %tmp31, align 4
-  %tmp33 = getelementptr inbounds float, float addrspace(3)* %arg, i32 19
-  %tmp34 = load float, float addrspace(3)* %tmp33, align 4
+  %tmp29 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 17
+  %tmp30 = load float, ptr addrspace(3) %tmp29, align 4
+  %tmp31 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 18
+  %tmp32 = load float, ptr addrspace(3) %tmp31, align 4
+  %tmp33 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 19
+  %tmp34 = load float, ptr addrspace(3) %tmp33, align 4
   %tmp35 = tail call float @llvm.fmuladd.f32(float %tmp30, float %tmp32, float %tmp34)
-  %tmp36 = getelementptr inbounds float, float addrspace(3)* %arg, i32 21
-  %tmp37 = load float, float addrspace(3)* %tmp36, align 4
-  %tmp38 = getelementptr inbounds float, float addrspace(3)* %arg, i32 22
-  %tmp39 = load float, float addrspace(3)* %tmp38, align 4
-  %tmp40 = getelementptr inbounds float, float addrspace(3)* %arg, i32 23
-  %tmp41 = load float, float addrspace(3)* %tmp40, align 4
+  %tmp36 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 21
+  %tmp37 = load float, ptr addrspace(3) %tmp36, align 4
+  %tmp38 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 22
+  %tmp39 = load float, ptr addrspace(3) %tmp38, align 4
+  %tmp40 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 23
+  %tmp41 = load float, ptr addrspace(3) %tmp40, align 4
   %tmp42 = tail call float @llvm.fmuladd.f32(float %tmp37, float %tmp39, float %tmp41)
-  %tmp43 = getelementptr inbounds float, float addrspace(3)* %arg, i32 25
-  %tmp44 = load float, float addrspace(3)* %tmp43, align 4
-  %tmp45 = getelementptr inbounds float, float addrspace(3)* %arg, i32 26
-  %tmp46 = load float, float addrspace(3)* %tmp45, align 4
-  %tmp47 = getelementptr inbounds float, float addrspace(3)* %arg, i32 27
-  %tmp48 = load float, float addrspace(3)* %tmp47, align 4
+  %tmp43 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 25
+  %tmp44 = load float, ptr addrspace(3) %tmp43, align 4
+  %tmp45 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 26
+  %tmp46 = load float, ptr addrspace(3) %tmp45, align 4
+  %tmp47 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 27
+  %tmp48 = load float, ptr addrspace(3) %tmp47, align 4
   %tmp49 = tail call float @llvm.fmuladd.f32(float %tmp44, float %tmp46, float %tmp48)
-  %tmp50 = getelementptr inbounds float, float addrspace(3)* %arg, i32 29
-  %tmp51 = load float, float addrspace(3)* %tmp50, align 4
-  %tmp52 = getelementptr inbounds float, float addrspace(3)* %arg, i32 30
-  %tmp53 = load float, float addrspace(3)* %tmp52, align 4
-  %tmp54 = getelementptr inbounds float, float addrspace(3)* %arg, i32 31
-  %tmp55 = load float, float addrspace(3)* %tmp54, align 4
+  %tmp50 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 29
+  %tmp51 = load float, ptr addrspace(3) %tmp50, align 4
+  %tmp52 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 30
+  %tmp53 = load float, ptr addrspace(3) %tmp52, align 4
+  %tmp54 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 31
+  %tmp55 = load float, ptr addrspace(3) %tmp54, align 4
   %tmp56 = tail call float @llvm.fmuladd.f32(float %tmp51, float %tmp53, float %tmp55)
-  %tmp57 = getelementptr inbounds float, float addrspace(3)* %arg, i32 33
-  %tmp58 = load float, float addrspace(3)* %tmp57, align 4
-  %tmp59 = getelementptr inbounds float, float addrspace(3)* %arg, i32 34
-  %tmp60 = load float, float addrspace(3)* %tmp59, align 4
-  %tmp61 = getelementptr inbounds float, float addrspace(3)* %arg, i32 35
-  %tmp62 = load float, float addrspace(3)* %tmp61, align 4
+  %tmp57 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 33
+  %tmp58 = load float, ptr addrspace(3) %tmp57, align 4
+  %tmp59 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 34
+  %tmp60 = load float, ptr addrspace(3) %tmp59, align 4
+  %tmp61 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 35
+  %tmp62 = load float, ptr addrspace(3) %tmp61, align 4
   %tmp63 = tail call float @llvm.fmuladd.f32(float %tmp58, float %tmp60, float %tmp62)
-  %tmp64 = getelementptr inbounds float, float addrspace(3)* %arg, i32 37
-  %tmp65 = load float, float addrspace(3)* %tmp64, align 4
-  %tmp66 = getelementptr inbounds float, float addrspace(3)* %arg, i32 38
-  %tmp67 = load float, float addrspace(3)* %tmp66, align 4
-  %tmp68 = getelementptr inbounds float, float addrspace(3)* %arg, i32 39
-  %tmp69 = load float, float addrspace(3)* %tmp68, align 4
+  %tmp64 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 37
+  %tmp65 = load float, ptr addrspace(3) %tmp64, align 4
+  %tmp66 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 38
+  %tmp67 = load float, ptr addrspace(3) %tmp66, align 4
+  %tmp68 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 39
+  %tmp69 = load float, ptr addrspace(3) %tmp68, align 4
   %tmp70 = tail call float @llvm.fmuladd.f32(float %tmp65, float %tmp67, float %tmp69)
-  %tmp71 = getelementptr inbounds float, float addrspace(3)* %arg, i32 41
-  %tmp72 = load float, float addrspace(3)* %tmp71, align 4
-  %tmp73 = getelementptr inbounds float, float addrspace(3)* %arg, i32 42
-  %tmp74 = load float, float addrspace(3)* %tmp73, align 4
-  %tmp75 = getelementptr inbounds float, float addrspace(3)* %arg, i32 43
-  %tmp76 = load float, float addrspace(3)* %tmp75, align 4
+  %tmp71 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 41
+  %tmp72 = load float, ptr addrspace(3) %tmp71, align 4
+  %tmp73 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 42
+  %tmp74 = load float, ptr addrspace(3) %tmp73, align 4
+  %tmp75 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 43
+  %tmp76 = load float, ptr addrspace(3) %tmp75, align 4
   %tmp77 = tail call float @llvm.fmuladd.f32(float %tmp72, float %tmp74, float %tmp76)
-  %tmp78 = getelementptr inbounds float, float addrspace(3)* %arg, i32 45
-  %tmp79 = load float, float addrspace(3)* %tmp78, align 4
-  %tmp80 = getelementptr inbounds float, float addrspace(3)* %arg, i32 46
-  %tmp81 = load float, float addrspace(3)* %tmp80, align 4
-  %tmp82 = getelementptr inbounds float, float addrspace(3)* %arg, i32 47
-  %tmp83 = load float, float addrspace(3)* %tmp82, align 4
+  %tmp78 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 45
+  %tmp79 = load float, ptr addrspace(3) %tmp78, align 4
+  %tmp80 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 46
+  %tmp81 = load float, ptr addrspace(3) %tmp80, align 4
+  %tmp82 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 47
+  %tmp83 = load float, ptr addrspace(3) %tmp82, align 4
   %tmp84 = tail call float @llvm.fmuladd.f32(float %tmp79, float %tmp81, float %tmp83)
-  %tmp85 = getelementptr inbounds float, float addrspace(3)* %arg, i32 49
-  %tmp86 = load float, float addrspace(3)* %tmp85, align 4
-  %tmp87 = getelementptr inbounds float, float addrspace(3)* %arg, i32 50
-  %tmp88 = load float, float addrspace(3)* %tmp87, align 4
-  %tmp89 = getelementptr inbounds float, float addrspace(3)* %arg, i32 51
-  %tmp90 = load float, float addrspace(3)* %tmp89, align 4
+  %tmp85 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 49
+  %tmp86 = load float, ptr addrspace(3) %tmp85, align 4
+  %tmp87 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 50
+  %tmp88 = load float, ptr addrspace(3) %tmp87, align 4
+  %tmp89 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 51
+  %tmp90 = load float, ptr addrspace(3) %tmp89, align 4
   %tmp91 = tail call float @llvm.fmuladd.f32(float %tmp86, float %tmp88, float %tmp90)
-  %tmp92 = getelementptr inbounds float, float addrspace(3)* %arg, i32 53
-  %tmp93 = load float, float addrspace(3)* %tmp92, align 4
-  %tmp94 = getelementptr inbounds float, float addrspace(3)* %arg, i32 54
-  %tmp95 = load float, float addrspace(3)* %tmp94, align 4
-  %tmp96 = getelementptr inbounds float, float addrspace(3)* %arg, i32 55
-  %tmp97 = load float, float addrspace(3)* %tmp96, align 4
+  %tmp92 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 53
+  %tmp93 = load float, ptr addrspace(3) %tmp92, align 4
+  %tmp94 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 54
+  %tmp95 = load float, ptr addrspace(3) %tmp94, align 4
+  %tmp96 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 55
+  %tmp97 = load float, ptr addrspace(3) %tmp96, align 4
   %tmp98 = tail call float @llvm.fmuladd.f32(float %tmp93, float %tmp95, float %tmp97)
-  %tmp99 = getelementptr inbounds float, float addrspace(3)* %arg, i32 57
-  %tmp100 = load float, float addrspace(3)* %tmp99, align 4
-  %tmp101 = getelementptr inbounds float, float addrspace(3)* %arg, i32 58
-  %tmp102 = load float, float addrspace(3)* %tmp101, align 4
-  %tmp103 = getelementptr inbounds float, float addrspace(3)* %arg, i32 59
-  %tmp104 = load float, float addrspace(3)* %tmp103, align 4
+  %tmp99 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 57
+  %tmp100 = load float, ptr addrspace(3) %tmp99, align 4
+  %tmp101 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 58
+  %tmp102 = load float, ptr addrspace(3) %tmp101, align 4
+  %tmp103 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 59
+  %tmp104 = load float, ptr addrspace(3) %tmp103, align 4
   %tmp105 = tail call float @llvm.fmuladd.f32(float %tmp100, float %tmp102, float %tmp104)
-  %tmp106 = getelementptr inbounds float, float addrspace(3)* %arg, i32 61
-  %tmp107 = load float, float addrspace(3)* %tmp106, align 4
-  %tmp108 = getelementptr inbounds float, float addrspace(3)* %arg, i32 62
-  %tmp109 = load float, float addrspace(3)* %tmp108, align 4
-  %tmp110 = getelementptr inbounds float, float addrspace(3)* %arg, i32 63
-  %tmp111 = load float, float addrspace(3)* %tmp110, align 4
+  %tmp106 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 61
+  %tmp107 = load float, ptr addrspace(3) %tmp106, align 4
+  %tmp108 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 62
+  %tmp109 = load float, ptr addrspace(3) %tmp108, align 4
+  %tmp110 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 63
+  %tmp111 = load float, ptr addrspace(3) %tmp110, align 4
   %tmp112 = tail call float @llvm.fmuladd.f32(float %tmp107, float %tmp109, float %tmp111)
-  %tmp113 = getelementptr inbounds float, float addrspace(3)* %arg, i32 65
-  %tmp114 = load float, float addrspace(3)* %tmp113, align 4
-  %tmp115 = getelementptr inbounds float, float addrspace(3)* %arg, i32 66
-  %tmp116 = load float, float addrspace(3)* %tmp115, align 4
-  %tmp117 = getelementptr inbounds float, float addrspace(3)* %arg, i32 67
-  %tmp118 = load float, float addrspace(3)* %tmp117, align 4
+  %tmp113 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 65
+  %tmp114 = load float, ptr addrspace(3) %tmp113, align 4
+  %tmp115 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 66
+  %tmp116 = load float, ptr addrspace(3) %tmp115, align 4
+  %tmp117 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 67
+  %tmp118 = load float, ptr addrspace(3) %tmp117, align 4
   %tmp119 = tail call float @llvm.fmuladd.f32(float %tmp114, float %tmp116, float %tmp118)
-  %tmp120 = getelementptr inbounds float, float addrspace(3)* %arg, i32 69
-  %tmp121 = load float, float addrspace(3)* %tmp120, align 4
-  %tmp122 = getelementptr inbounds float, float addrspace(3)* %arg, i32 70
-  %tmp123 = load float, float addrspace(3)* %tmp122, align 4
-  %tmp124 = getelementptr inbounds float, float addrspace(3)* %arg, i32 71
-  %tmp125 = load float, float addrspace(3)* %tmp124, align 4
+  %tmp120 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 69
+  %tmp121 = load float, ptr addrspace(3) %tmp120, align 4
+  %tmp122 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 70
+  %tmp123 = load float, ptr addrspace(3) %tmp122, align 4
+  %tmp124 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 71
+  %tmp125 = load float, ptr addrspace(3) %tmp124, align 4
   %tmp126 = tail call float @llvm.fmuladd.f32(float %tmp121, float %tmp123, float %tmp125)
-  %tmp127 = getelementptr inbounds float, float addrspace(3)* %arg, i32 73
-  %tmp128 = load float, float addrspace(3)* %tmp127, align 4
-  %tmp129 = getelementptr inbounds float, float addrspace(3)* %arg, i32 74
-  %tmp130 = load float, float addrspace(3)* %tmp129, align 4
-  %tmp131 = getelementptr inbounds float, float addrspace(3)* %arg, i32 75
-  %tmp132 = load float, float addrspace(3)* %tmp131, align 4
+  %tmp127 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 73
+  %tmp128 = load float, ptr addrspace(3) %tmp127, align 4
+  %tmp129 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 74
+  %tmp130 = load float, ptr addrspace(3) %tmp129, align 4
+  %tmp131 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 75
+  %tmp132 = load float, ptr addrspace(3) %tmp131, align 4
   %tmp133 = tail call float @llvm.fmuladd.f32(float %tmp128, float %tmp130, float %tmp132)
-  %tmp134 = getelementptr inbounds float, float addrspace(3)* %arg, i32 77
-  %tmp135 = load float, float addrspace(3)* %tmp134, align 4
-  %tmp136 = getelementptr inbounds float, float addrspace(3)* %arg, i32 78
-  %tmp137 = load float, float addrspace(3)* %tmp136, align 4
-  %tmp138 = getelementptr inbounds float, float addrspace(3)* %arg, i32 79
-  %tmp139 = load float, float addrspace(3)* %tmp138, align 4
+  %tmp134 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 77
+  %tmp135 = load float, ptr addrspace(3) %tmp134, align 4
+  %tmp136 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 78
+  %tmp137 = load float, ptr addrspace(3) %tmp136, align 4
+  %tmp138 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 79
+  %tmp139 = load float, ptr addrspace(3) %tmp138, align 4
   %tmp140 = tail call float @llvm.fmuladd.f32(float %tmp135, float %tmp137, float %tmp139)
-  %tmp141 = getelementptr inbounds float, float addrspace(3)* %arg, i32 81
-  %tmp142 = load float, float addrspace(3)* %tmp141, align 4
-  %tmp143 = getelementptr inbounds float, float addrspace(3)* %arg, i32 82
-  %tmp144 = load float, float addrspace(3)* %tmp143, align 4
-  %tmp145 = getelementptr inbounds float, float addrspace(3)* %arg, i32 83
-  %tmp146 = load float, float addrspace(3)* %tmp145, align 4
+  %tmp141 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 81
+  %tmp142 = load float, ptr addrspace(3) %tmp141, align 4
+  %tmp143 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 82
+  %tmp144 = load float, ptr addrspace(3) %tmp143, align 4
+  %tmp145 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 83
+  %tmp146 = load float, ptr addrspace(3) %tmp145, align 4
   %tmp147 = tail call float @llvm.fmuladd.f32(float %tmp142, float %tmp144, float %tmp146)
-  %tmp148 = getelementptr inbounds float, float addrspace(3)* %arg, i32 85
-  %tmp149 = load float, float addrspace(3)* %tmp148, align 4
-  %tmp150 = getelementptr inbounds float, float addrspace(3)* %arg, i32 86
-  %tmp151 = load float, float addrspace(3)* %tmp150, align 4
-  %tmp152 = getelementptr inbounds float, float addrspace(3)* %arg, i32 87
-  %tmp153 = load float, float addrspace(3)* %tmp152, align 4
+  %tmp148 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 85
+  %tmp149 = load float, ptr addrspace(3) %tmp148, align 4
+  %tmp150 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 86
+  %tmp151 = load float, ptr addrspace(3) %tmp150, align 4
+  %tmp152 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 87
+  %tmp153 = load float, ptr addrspace(3) %tmp152, align 4
   %tmp154 = tail call float @llvm.fmuladd.f32(float %tmp149, float %tmp151, float %tmp153)
-  %tmp155 = getelementptr inbounds float, float addrspace(3)* %arg, i32 89
-  %tmp156 = load float, float addrspace(3)* %tmp155, align 4
-  %tmp157 = getelementptr inbounds float, float addrspace(3)* %arg, i32 90
-  %tmp158 = load float, float addrspace(3)* %tmp157, align 4
-  %tmp159 = getelementptr inbounds float, float addrspace(3)* %arg, i32 91
-  %tmp160 = load float, float addrspace(3)* %tmp159, align 4
+  %tmp155 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 89
+  %tmp156 = load float, ptr addrspace(3) %tmp155, align 4
+  %tmp157 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 90
+  %tmp158 = load float, ptr addrspace(3) %tmp157, align 4
+  %tmp159 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 91
+  %tmp160 = load float, ptr addrspace(3) %tmp159, align 4
   %tmp161 = tail call float @llvm.fmuladd.f32(float %tmp156, float %tmp158, float %tmp160)
-  %tmp162 = getelementptr inbounds float, float addrspace(3)* %arg, i32 93
-  %tmp163 = load float, float addrspace(3)* %tmp162, align 4
-  %tmp164 = getelementptr inbounds float, float addrspace(3)* %arg, i32 94
-  %tmp165 = load float, float addrspace(3)* %tmp164, align 4
-  %tmp166 = getelementptr inbounds float, float addrspace(3)* %arg, i32 95
-  %tmp167 = load float, float addrspace(3)* %tmp166, align 4
+  %tmp162 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 93
+  %tmp163 = load float, ptr addrspace(3) %tmp162, align 4
+  %tmp164 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 94
+  %tmp165 = load float, ptr addrspace(3) %tmp164, align 4
+  %tmp166 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 95
+  %tmp167 = load float, ptr addrspace(3) %tmp166, align 4
   %tmp168 = tail call float @llvm.fmuladd.f32(float %tmp163, float %tmp165, float %tmp167)
-  %tmp169 = getelementptr inbounds float, float addrspace(3)* %arg, i32 97
-  %tmp170 = load float, float addrspace(3)* %tmp169, align 4
-  %tmp171 = getelementptr inbounds float, float addrspace(3)* %arg, i32 98
-  %tmp172 = load float, float addrspace(3)* %tmp171, align 4
-  %tmp173 = getelementptr inbounds float, float addrspace(3)* %arg, i32 99
-  %tmp174 = load float, float addrspace(3)* %tmp173, align 4
+  %tmp169 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 97
+  %tmp170 = load float, ptr addrspace(3) %tmp169, align 4
+  %tmp171 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 98
+  %tmp172 = load float, ptr addrspace(3) %tmp171, align 4
+  %tmp173 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 99
+  %tmp174 = load float, ptr addrspace(3) %tmp173, align 4
   %tmp175 = tail call float @llvm.fmuladd.f32(float %tmp170, float %tmp172, float %tmp174)
-  %tmp176 = getelementptr inbounds float, float addrspace(3)* %arg, i32 101
-  %tmp177 = load float, float addrspace(3)* %tmp176, align 4
-  %tmp178 = getelementptr inbounds float, float addrspace(3)* %arg, i32 102
-  %tmp179 = load float, float addrspace(3)* %tmp178, align 4
-  %tmp180 = getelementptr inbounds float, float addrspace(3)* %arg, i32 103
-  %tmp181 = load float, float addrspace(3)* %tmp180, align 4
+  %tmp176 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 101
+  %tmp177 = load float, ptr addrspace(3) %tmp176, align 4
+  %tmp178 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 102
+  %tmp179 = load float, ptr addrspace(3) %tmp178, align 4
+  %tmp180 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 103
+  %tmp181 = load float, ptr addrspace(3) %tmp180, align 4
   %tmp182 = tail call float @llvm.fmuladd.f32(float %tmp177, float %tmp179, float %tmp181)
-  %tmp183 = getelementptr inbounds float, float addrspace(3)* %arg, i32 105
-  %tmp184 = load float, float addrspace(3)* %tmp183, align 4
-  %tmp185 = getelementptr inbounds float, float addrspace(3)* %arg, i32 106
-  %tmp186 = load float, float addrspace(3)* %tmp185, align 4
-  %tmp187 = getelementptr inbounds float, float addrspace(3)* %arg, i32 107
-  %tmp188 = load float, float addrspace(3)* %tmp187, align 4
+  %tmp183 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 105
+  %tmp184 = load float, ptr addrspace(3) %tmp183, align 4
+  %tmp185 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 106
+  %tmp186 = load float, ptr addrspace(3) %tmp185, align 4
+  %tmp187 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 107
+  %tmp188 = load float, ptr addrspace(3) %tmp187, align 4
   %tmp189 = tail call float @llvm.fmuladd.f32(float %tmp184, float %tmp186, float %tmp188)
-  %tmp190 = getelementptr inbounds float, float addrspace(3)* %arg, i32 109
-  %tmp191 = load float, float addrspace(3)* %tmp190, align 4
-  %tmp192 = getelementptr inbounds float, float addrspace(3)* %arg, i32 110
-  %tmp193 = load float, float addrspace(3)* %tmp192, align 4
-  %tmp194 = getelementptr inbounds float, float addrspace(3)* %arg, i32 111
-  %tmp195 = load float, float addrspace(3)* %tmp194, align 4
+  %tmp190 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 109
+  %tmp191 = load float, ptr addrspace(3) %tmp190, align 4
+  %tmp192 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 110
+  %tmp193 = load float, ptr addrspace(3) %tmp192, align 4
+  %tmp194 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 111
+  %tmp195 = load float, ptr addrspace(3) %tmp194, align 4
   %tmp196 = tail call float @llvm.fmuladd.f32(float %tmp191, float %tmp193, float %tmp195)
-  %tmp197 = getelementptr inbounds float, float addrspace(3)* %arg, i32 113
-  %tmp198 = load float, float addrspace(3)* %tmp197, align 4
-  %tmp199 = getelementptr inbounds float, float addrspace(3)* %arg, i32 114
-  %tmp200 = load float, float addrspace(3)* %tmp199, align 4
-  %tmp201 = getelementptr inbounds float, float addrspace(3)* %arg, i32 115
-  %tmp202 = load float, float addrspace(3)* %tmp201, align 4
+  %tmp197 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 113
+  %tmp198 = load float, ptr addrspace(3) %tmp197, align 4
+  %tmp199 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 114
+  %tmp200 = load float, ptr addrspace(3) %tmp199, align 4
+  %tmp201 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 115
+  %tmp202 = load float, ptr addrspace(3) %tmp201, align 4
   %tmp203 = tail call float @llvm.fmuladd.f32(float %tmp198, float %tmp200, float %tmp202)
-  %tmp204 = getelementptr inbounds float, float addrspace(3)* %arg, i32 117
-  %tmp205 = load float, float addrspace(3)* %tmp204, align 4
-  %tmp206 = getelementptr inbounds float, float addrspace(3)* %arg, i32 118
-  %tmp207 = load float, float addrspace(3)* %tmp206, align 4
-  %tmp208 = getelementptr inbounds float, float addrspace(3)* %arg, i32 119
-  %tmp209 = load float, float addrspace(3)* %tmp208, align 4
+  %tmp204 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 117
+  %tmp205 = load float, ptr addrspace(3) %tmp204, align 4
+  %tmp206 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 118
+  %tmp207 = load float, ptr addrspace(3) %tmp206, align 4
+  %tmp208 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 119
+  %tmp209 = load float, ptr addrspace(3) %tmp208, align 4
   %tmp210 = tail call float @llvm.fmuladd.f32(float %tmp205, float %tmp207, float %tmp209)
-  %tmp211 = getelementptr inbounds float, float addrspace(3)* %arg, i32 121
-  %tmp212 = load float, float addrspace(3)* %tmp211, align 4
-  %tmp213 = getelementptr inbounds float, float addrspace(3)* %arg, i32 122
-  %tmp214 = load float, float addrspace(3)* %tmp213, align 4
-  %tmp215 = getelementptr inbounds float, float addrspace(3)* %arg, i32 123
-  %tmp216 = load float, float addrspace(3)* %tmp215, align 4
+  %tmp211 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 121
+  %tmp212 = load float, ptr addrspace(3) %tmp211, align 4
+  %tmp213 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 122
+  %tmp214 = load float, ptr addrspace(3) %tmp213, align 4
+  %tmp215 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 123
+  %tmp216 = load float, ptr addrspace(3) %tmp215, align 4
   %tmp217 = tail call float @llvm.fmuladd.f32(float %tmp212, float %tmp214, float %tmp216)
-  %tmp218 = getelementptr inbounds float, float addrspace(3)* %arg, i32 125
-  %tmp219 = load float, float addrspace(3)* %tmp218, align 4
-  %tmp220 = getelementptr inbounds float, float addrspace(3)* %arg, i32 126
-  %tmp221 = load float, float addrspace(3)* %tmp220, align 4
-  %tmp222 = getelementptr inbounds float, float addrspace(3)* %arg, i32 127
-  %tmp223 = load float, float addrspace(3)* %tmp222, align 4
+  %tmp218 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 125
+  %tmp219 = load float, ptr addrspace(3) %tmp218, align 4
+  %tmp220 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 126
+  %tmp221 = load float, ptr addrspace(3) %tmp220, align 4
+  %tmp222 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 127
+  %tmp223 = load float, ptr addrspace(3) %tmp222, align 4
   %tmp224 = tail call float @llvm.fmuladd.f32(float %tmp219, float %tmp221, float %tmp223)
-  %tmp225 = getelementptr inbounds float, float addrspace(3)* %arg, i32 129
-  %tmp226 = load float, float addrspace(3)* %tmp225, align 4
-  %tmp227 = getelementptr inbounds float, float addrspace(3)* %arg, i32 130
-  %tmp228 = load float, float addrspace(3)* %tmp227, align 4
-  %tmp229 = getelementptr inbounds float, float addrspace(3)* %arg, i32 131
-  %tmp230 = load float, float addrspace(3)* %tmp229, align 4
+  %tmp225 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 129
+  %tmp226 = load float, ptr addrspace(3) %tmp225, align 4
+  %tmp227 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 130
+  %tmp228 = load float, ptr addrspace(3) %tmp227, align 4
+  %tmp229 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 131
+  %tmp230 = load float, ptr addrspace(3) %tmp229, align 4
   %tmp231 = tail call float @llvm.fmuladd.f32(float %tmp226, float %tmp228, float %tmp230)
-  %tmp232 = getelementptr inbounds float, float addrspace(3)* %arg, i32 133
-  %tmp233 = load float, float addrspace(3)* %tmp232, align 4
-  %tmp234 = getelementptr inbounds float, float addrspace(3)* %arg, i32 134
-  %tmp235 = load float, float addrspace(3)* %tmp234, align 4
-  %tmp236 = getelementptr inbounds float, float addrspace(3)* %arg, i32 135
-  %tmp237 = load float, float addrspace(3)* %tmp236, align 4
+  %tmp232 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 133
+  %tmp233 = load float, ptr addrspace(3) %tmp232, align 4
+  %tmp234 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 134
+  %tmp235 = load float, ptr addrspace(3) %tmp234, align 4
+  %tmp236 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 135
+  %tmp237 = load float, ptr addrspace(3) %tmp236, align 4
   %tmp238 = tail call float @llvm.fmuladd.f32(float %tmp233, float %tmp235, float %tmp237)
-  %tmp239 = getelementptr inbounds float, float addrspace(3)* %arg, i32 137
-  %tmp240 = load float, float addrspace(3)* %tmp239, align 4
-  %tmp241 = getelementptr inbounds float, float addrspace(3)* %arg, i32 138
-  %tmp242 = load float, float addrspace(3)* %tmp241, align 4
-  %tmp243 = getelementptr inbounds float, float addrspace(3)* %arg, i32 139
-  %tmp244 = load float, float addrspace(3)* %tmp243, align 4
+  %tmp239 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 137
+  %tmp240 = load float, ptr addrspace(3) %tmp239, align 4
+  %tmp241 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 138
+  %tmp242 = load float, ptr addrspace(3) %tmp241, align 4
+  %tmp243 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 139
+  %tmp244 = load float, ptr addrspace(3) %tmp243, align 4
   %tmp245 = tail call float @llvm.fmuladd.f32(float %tmp240, float %tmp242, float %tmp244)
-  %tmp246 = getelementptr inbounds float, float addrspace(3)* %arg, i32 141
-  %tmp247 = load float, float addrspace(3)* %tmp246, align 4
-  %tmp248 = getelementptr inbounds float, float addrspace(3)* %arg, i32 142
-  %tmp249 = load float, float addrspace(3)* %tmp248, align 4
-  %tmp250 = getelementptr inbounds float, float addrspace(3)* %arg, i32 143
-  %tmp251 = load float, float addrspace(3)* %tmp250, align 4
+  %tmp246 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 141
+  %tmp247 = load float, ptr addrspace(3) %tmp246, align 4
+  %tmp248 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 142
+  %tmp249 = load float, ptr addrspace(3) %tmp248, align 4
+  %tmp250 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 143
+  %tmp251 = load float, ptr addrspace(3) %tmp250, align 4
   %tmp252 = tail call float @llvm.fmuladd.f32(float %tmp247, float %tmp249, float %tmp251)
-  %tmp253 = getelementptr inbounds float, float addrspace(3)* %arg, i32 145
-  %tmp254 = load float, float addrspace(3)* %tmp253, align 4
-  %tmp255 = getelementptr inbounds float, float addrspace(3)* %arg, i32 146
-  %tmp256 = load float, float addrspace(3)* %tmp255, align 4
-  %tmp257 = getelementptr inbounds float, float addrspace(3)* %arg, i32 147
-  %tmp258 = load float, float addrspace(3)* %tmp257, align 4
+  %tmp253 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 145
+  %tmp254 = load float, ptr addrspace(3) %tmp253, align 4
+  %tmp255 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 146
+  %tmp256 = load float, ptr addrspace(3) %tmp255, align 4
+  %tmp257 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 147
+  %tmp258 = load float, ptr addrspace(3) %tmp257, align 4
   %tmp259 = tail call float @llvm.fmuladd.f32(float %tmp254, float %tmp256, float %tmp258)
-  %tmp260 = getelementptr inbounds float, float addrspace(3)* %arg, i32 149
-  %tmp261 = load float, float addrspace(3)* %tmp260, align 4
-  %tmp262 = getelementptr inbounds float, float addrspace(3)* %arg, i32 150
-  %tmp263 = load float, float addrspace(3)* %tmp262, align 4
-  %tmp264 = getelementptr inbounds float, float addrspace(3)* %arg, i32 151
-  %tmp265 = load float, float addrspace(3)* %tmp264, align 4
+  %tmp260 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 149
+  %tmp261 = load float, ptr addrspace(3) %tmp260, align 4
+  %tmp262 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 150
+  %tmp263 = load float, ptr addrspace(3) %tmp262, align 4
+  %tmp264 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 151
+  %tmp265 = load float, ptr addrspace(3) %tmp264, align 4
   %tmp266 = tail call float @llvm.fmuladd.f32(float %tmp261, float %tmp263, float %tmp265)
-  %tmp267 = getelementptr inbounds float, float addrspace(3)* %arg, i32 153
-  %tmp268 = load float, float addrspace(3)* %tmp267, align 4
-  %tmp269 = getelementptr inbounds float, float addrspace(3)* %arg, i32 154
-  %tmp270 = load float, float addrspace(3)* %tmp269, align 4
-  %tmp271 = getelementptr inbounds float, float addrspace(3)* %arg, i32 155
-  %tmp272 = load float, float addrspace(3)* %tmp271, align 4
+  %tmp267 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 153
+  %tmp268 = load float, ptr addrspace(3) %tmp267, align 4
+  %tmp269 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 154
+  %tmp270 = load float, ptr addrspace(3) %tmp269, align 4
+  %tmp271 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 155
+  %tmp272 = load float, ptr addrspace(3) %tmp271, align 4
   %tmp273 = tail call float @llvm.fmuladd.f32(float %tmp268, float %tmp270, float %tmp272)
-  %tmp274 = getelementptr inbounds float, float addrspace(3)* %arg, i32 157
-  %tmp275 = load float, float addrspace(3)* %tmp274, align 4
-  %tmp276 = getelementptr inbounds float, float addrspace(3)* %arg, i32 158
-  %tmp277 = load float, float addrspace(3)* %tmp276, align 4
-  %tmp278 = getelementptr inbounds float, float addrspace(3)* %arg, i32 159
-  %tmp279 = load float, float addrspace(3)* %tmp278, align 4
+  %tmp274 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 157
+  %tmp275 = load float, ptr addrspace(3) %tmp274, align 4
+  %tmp276 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 158
+  %tmp277 = load float, ptr addrspace(3) %tmp276, align 4
+  %tmp278 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 159
+  %tmp279 = load float, ptr addrspace(3) %tmp278, align 4
   %tmp280 = tail call float @llvm.fmuladd.f32(float %tmp275, float %tmp277, float %tmp279)
-  %tmp281 = getelementptr inbounds float, float addrspace(3)* %arg, i32 161
-  %tmp282 = load float, float addrspace(3)* %tmp281, align 4
-  %tmp283 = getelementptr inbounds float, float addrspace(3)* %arg, i32 162
-  %tmp284 = load float, float addrspace(3)* %tmp283, align 4
-  %tmp285 = getelementptr inbounds float, float addrspace(3)* %arg, i32 163
-  %tmp286 = load float, float addrspace(3)* %tmp285, align 4
+  %tmp281 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 161
+  %tmp282 = load float, ptr addrspace(3) %tmp281, align 4
+  %tmp283 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 162
+  %tmp284 = load float, ptr addrspace(3) %tmp283, align 4
+  %tmp285 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 163
+  %tmp286 = load float, ptr addrspace(3) %tmp285, align 4
   %tmp287 = tail call float @llvm.fmuladd.f32(float %tmp282, float %tmp284, float %tmp286)
-  %tmp288 = getelementptr inbounds float, float addrspace(3)* %arg, i32 165
-  %tmp289 = load float, float addrspace(3)* %tmp288, align 4
-  %tmp290 = getelementptr inbounds float, float addrspace(3)* %arg, i32 166
-  %tmp291 = load float, float addrspace(3)* %tmp290, align 4
-  %tmp292 = getelementptr inbounds float, float addrspace(3)* %arg, i32 167
-  %tmp293 = load float, float addrspace(3)* %tmp292, align 4
+  %tmp288 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 165
+  %tmp289 = load float, ptr addrspace(3) %tmp288, align 4
+  %tmp290 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 166
+  %tmp291 = load float, ptr addrspace(3) %tmp290, align 4
+  %tmp292 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 167
+  %tmp293 = load float, ptr addrspace(3) %tmp292, align 4
   %tmp294 = tail call float @llvm.fmuladd.f32(float %tmp289, float %tmp291, float %tmp293)
-  %tmp295 = getelementptr inbounds float, float addrspace(3)* %arg, i32 169
-  %tmp296 = load float, float addrspace(3)* %tmp295, align 4
-  %tmp297 = getelementptr inbounds float, float addrspace(3)* %arg, i32 170
-  %tmp298 = load float, float addrspace(3)* %tmp297, align 4
-  %tmp299 = getelementptr inbounds float, float addrspace(3)* %arg, i32 171
-  %tmp300 = load float, float addrspace(3)* %tmp299, align 4
+  %tmp295 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 169
+  %tmp296 = load float, ptr addrspace(3) %tmp295, align 4
+  %tmp297 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 170
+  %tmp298 = load float, ptr addrspace(3) %tmp297, align 4
+  %tmp299 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 171
+  %tmp300 = load float, ptr addrspace(3) %tmp299, align 4
   %tmp301 = tail call float @llvm.fmuladd.f32(float %tmp296, float %tmp298, float %tmp300)
-  %tmp302 = getelementptr inbounds float, float addrspace(3)* %arg, i32 173
-  %tmp303 = load float, float addrspace(3)* %tmp302, align 4
-  %tmp304 = getelementptr inbounds float, float addrspace(3)* %arg, i32 174
-  %tmp305 = load float, float addrspace(3)* %tmp304, align 4
-  %tmp306 = getelementptr inbounds float, float addrspace(3)* %arg, i32 175
-  %tmp307 = load float, float addrspace(3)* %tmp306, align 4
+  %tmp302 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 173
+  %tmp303 = load float, ptr addrspace(3) %tmp302, align 4
+  %tmp304 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 174
+  %tmp305 = load float, ptr addrspace(3) %tmp304, align 4
+  %tmp306 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 175
+  %tmp307 = load float, ptr addrspace(3) %tmp306, align 4
   %tmp308 = tail call float @llvm.fmuladd.f32(float %tmp303, float %tmp305, float %tmp307)
-  %tmp309 = getelementptr inbounds float, float addrspace(3)* %arg, i32 177
-  %tmp310 = load float, float addrspace(3)* %tmp309, align 4
-  %tmp311 = getelementptr inbounds float, float addrspace(3)* %arg, i32 178
-  %tmp312 = load float, float addrspace(3)* %tmp311, align 4
-  %tmp313 = getelementptr inbounds float, float addrspace(3)* %arg, i32 179
-  %tmp314 = load float, float addrspace(3)* %tmp313, align 4
+  %tmp309 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 177
+  %tmp310 = load float, ptr addrspace(3) %tmp309, align 4
+  %tmp311 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 178
+  %tmp312 = load float, ptr addrspace(3) %tmp311, align 4
+  %tmp313 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 179
+  %tmp314 = load float, ptr addrspace(3) %tmp313, align 4
   %tmp315 = tail call float @llvm.fmuladd.f32(float %tmp310, float %tmp312, float %tmp314)
-  %tmp316 = getelementptr inbounds float, float addrspace(3)* %arg, i32 181
-  %tmp317 = load float, float addrspace(3)* %tmp316, align 4
-  %tmp318 = getelementptr inbounds float, float addrspace(3)* %arg, i32 182
-  %tmp319 = load float, float addrspace(3)* %tmp318, align 4
-  %tmp320 = getelementptr inbounds float, float addrspace(3)* %arg, i32 183
-  %tmp321 = load float, float addrspace(3)* %tmp320, align 4
+  %tmp316 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 181
+  %tmp317 = load float, ptr addrspace(3) %tmp316, align 4
+  %tmp318 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 182
+  %tmp319 = load float, ptr addrspace(3) %tmp318, align 4
+  %tmp320 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 183
+  %tmp321 = load float, ptr addrspace(3) %tmp320, align 4
   %tmp322 = tail call float @llvm.fmuladd.f32(float %tmp317, float %tmp319, float %tmp321)
-  %tmp323 = getelementptr inbounds float, float addrspace(3)* %arg, i32 185
-  %tmp324 = load float, float addrspace(3)* %tmp323, align 4
-  %tmp325 = getelementptr inbounds float, float addrspace(3)* %arg, i32 186
-  %tmp326 = load float, float addrspace(3)* %tmp325, align 4
-  %tmp327 = getelementptr inbounds float, float addrspace(3)* %arg, i32 187
-  %tmp328 = load float, float addrspace(3)* %tmp327, align 4
+  %tmp323 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 185
+  %tmp324 = load float, ptr addrspace(3) %tmp323, align 4
+  %tmp325 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 186
+  %tmp326 = load float, ptr addrspace(3) %tmp325, align 4
+  %tmp327 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 187
+  %tmp328 = load float, ptr addrspace(3) %tmp327, align 4
   %tmp329 = tail call float @llvm.fmuladd.f32(float %tmp324, float %tmp326, float %tmp328)
-  %tmp330 = getelementptr inbounds float, float addrspace(3)* %arg, i32 189
-  %tmp331 = load float, float addrspace(3)* %tmp330, align 4
-  %tmp332 = getelementptr inbounds float, float addrspace(3)* %arg, i32 190
-  %tmp333 = load float, float addrspace(3)* %tmp332, align 4
-  %tmp334 = getelementptr inbounds float, float addrspace(3)* %arg, i32 191
-  %tmp335 = load float, float addrspace(3)* %tmp334, align 4
+  %tmp330 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 189
+  %tmp331 = load float, ptr addrspace(3) %tmp330, align 4
+  %tmp332 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 190
+  %tmp333 = load float, ptr addrspace(3) %tmp332, align 4
+  %tmp334 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 191
+  %tmp335 = load float, ptr addrspace(3) %tmp334, align 4
   %tmp336 = tail call float @llvm.fmuladd.f32(float %tmp331, float %tmp333, float %tmp335)
-  %tmp337 = getelementptr inbounds float, float addrspace(3)* %arg, i32 193
-  %tmp338 = load float, float addrspace(3)* %tmp337, align 4
-  %tmp339 = getelementptr inbounds float, float addrspace(3)* %arg, i32 194
-  %tmp340 = load float, float addrspace(3)* %tmp339, align 4
-  %tmp341 = getelementptr inbounds float, float addrspace(3)* %arg, i32 195
-  %tmp342 = load float, float addrspace(3)* %tmp341, align 4
+  %tmp337 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 193
+  %tmp338 = load float, ptr addrspace(3) %tmp337, align 4
+  %tmp339 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 194
+  %tmp340 = load float, ptr addrspace(3) %tmp339, align 4
+  %tmp341 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 195
+  %tmp342 = load float, ptr addrspace(3) %tmp341, align 4
   %tmp343 = tail call float @llvm.fmuladd.f32(float %tmp338, float %tmp340, float %tmp342)
-  %tmp344 = getelementptr inbounds float, float addrspace(3)* %arg, i32 197
-  %tmp345 = load float, float addrspace(3)* %tmp344, align 4
-  %tmp346 = getelementptr inbounds float, float addrspace(3)* %arg, i32 198
-  %tmp347 = load float, float addrspace(3)* %tmp346, align 4
-  %tmp348 = getelementptr inbounds float, float addrspace(3)* %arg, i32 199
-  %tmp349 = load float, float addrspace(3)* %tmp348, align 4
+  %tmp344 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 197
+  %tmp345 = load float, ptr addrspace(3) %tmp344, align 4
+  %tmp346 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 198
+  %tmp347 = load float, ptr addrspace(3) %tmp346, align 4
+  %tmp348 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 199
+  %tmp349 = load float, ptr addrspace(3) %tmp348, align 4
   %tmp350 = tail call float @llvm.fmuladd.f32(float %tmp345, float %tmp347, float %tmp349)
-  %tmp351 = getelementptr inbounds float, float addrspace(3)* %arg, i32 201
-  %tmp352 = load float, float addrspace(3)* %tmp351, align 4
-  %tmp353 = getelementptr inbounds float, float addrspace(3)* %arg, i32 202
-  %tmp354 = load float, float addrspace(3)* %tmp353, align 4
-  %tmp355 = getelementptr inbounds float, float addrspace(3)* %arg, i32 203
-  %tmp356 = load float, float addrspace(3)* %tmp355, align 4
+  %tmp351 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 201
+  %tmp352 = load float, ptr addrspace(3) %tmp351, align 4
+  %tmp353 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 202
+  %tmp354 = load float, ptr addrspace(3) %tmp353, align 4
+  %tmp355 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 203
+  %tmp356 = load float, ptr addrspace(3) %tmp355, align 4
   %tmp357 = tail call float @llvm.fmuladd.f32(float %tmp352, float %tmp354, float %tmp356)
-  %tmp358 = getelementptr inbounds float, float addrspace(3)* %arg, i32 205
-  %tmp359 = load float, float addrspace(3)* %tmp358, align 4
-  %tmp360 = getelementptr inbounds float, float addrspace(3)* %arg, i32 206
-  %tmp361 = load float, float addrspace(3)* %tmp360, align 4
-  %tmp362 = getelementptr inbounds float, float addrspace(3)* %arg, i32 207
-  %tmp363 = load float, float addrspace(3)* %tmp362, align 4
+  %tmp358 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 205
+  %tmp359 = load float, ptr addrspace(3) %tmp358, align 4
+  %tmp360 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 206
+  %tmp361 = load float, ptr addrspace(3) %tmp360, align 4
+  %tmp362 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 207
+  %tmp363 = load float, ptr addrspace(3) %tmp362, align 4
   %tmp364 = tail call float @llvm.fmuladd.f32(float %tmp359, float %tmp361, float %tmp363)
-  %tmp365 = getelementptr inbounds float, float addrspace(3)* %arg, i32 209
-  %tmp366 = load float, float addrspace(3)* %tmp365, align 4
-  %tmp367 = getelementptr inbounds float, float addrspace(3)* %arg, i32 210
-  %tmp368 = load float, float addrspace(3)* %tmp367, align 4
-  %tmp369 = getelementptr inbounds float, float addrspace(3)* %arg, i32 211
-  %tmp370 = load float, float addrspace(3)* %tmp369, align 4
+  %tmp365 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 209
+  %tmp366 = load float, ptr addrspace(3) %tmp365, align 4
+  %tmp367 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 210
+  %tmp368 = load float, ptr addrspace(3) %tmp367, align 4
+  %tmp369 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 211
+  %tmp370 = load float, ptr addrspace(3) %tmp369, align 4
   %tmp371 = tail call float @llvm.fmuladd.f32(float %tmp366, float %tmp368, float %tmp370)
-  %tmp372 = getelementptr inbounds float, float addrspace(3)* %arg, i32 213
-  %tmp373 = load float, float addrspace(3)* %tmp372, align 4
-  %tmp374 = getelementptr inbounds float, float addrspace(3)* %arg, i32 214
-  %tmp375 = load float, float addrspace(3)* %tmp374, align 4
-  %tmp376 = getelementptr inbounds float, float addrspace(3)* %arg, i32 215
-  %tmp377 = load float, float addrspace(3)* %tmp376, align 4
+  %tmp372 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 213
+  %tmp373 = load float, ptr addrspace(3) %tmp372, align 4
+  %tmp374 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 214
+  %tmp375 = load float, ptr addrspace(3) %tmp374, align 4
+  %tmp376 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 215
+  %tmp377 = load float, ptr addrspace(3) %tmp376, align 4
   %tmp378 = tail call float @llvm.fmuladd.f32(float %tmp373, float %tmp375, float %tmp377)
-  %tmp379 = getelementptr inbounds float, float addrspace(3)* %arg, i32 217
-  %tmp380 = load float, float addrspace(3)* %tmp379, align 4
-  %tmp381 = getelementptr inbounds float, float addrspace(3)* %arg, i32 218
-  %tmp382 = load float, float addrspace(3)* %tmp381, align 4
-  %tmp383 = getelementptr inbounds float, float addrspace(3)* %arg, i32 219
-  %tmp384 = load float, float addrspace(3)* %tmp383, align 4
+  %tmp379 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 217
+  %tmp380 = load float, ptr addrspace(3) %tmp379, align 4
+  %tmp381 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 218
+  %tmp382 = load float, ptr addrspace(3) %tmp381, align 4
+  %tmp383 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 219
+  %tmp384 = load float, ptr addrspace(3) %tmp383, align 4
   %tmp385 = tail call float @llvm.fmuladd.f32(float %tmp380, float %tmp382, float %tmp384)
-  %tmp386 = getelementptr inbounds float, float addrspace(3)* %arg, i32 221
-  %tmp387 = load float, float addrspace(3)* %tmp386, align 4
-  %tmp388 = getelementptr inbounds float, float addrspace(3)* %arg, i32 222
-  %tmp389 = load float, float addrspace(3)* %tmp388, align 4
-  %tmp390 = getelementptr inbounds float, float addrspace(3)* %arg, i32 223
-  %tmp391 = load float, float addrspace(3)* %tmp390, align 4
+  %tmp386 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 221
+  %tmp387 = load float, ptr addrspace(3) %tmp386, align 4
+  %tmp388 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 222
+  %tmp389 = load float, ptr addrspace(3) %tmp388, align 4
+  %tmp390 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 223
+  %tmp391 = load float, ptr addrspace(3) %tmp390, align 4
   %tmp392 = tail call float @llvm.fmuladd.f32(float %tmp387, float %tmp389, float %tmp391)
-  %tmp393 = getelementptr inbounds float, float addrspace(3)* %arg, i32 225
-  %tmp394 = load float, float addrspace(3)* %tmp393, align 4
-  %tmp395 = getelementptr inbounds float, float addrspace(3)* %arg, i32 226
-  %tmp396 = load float, float addrspace(3)* %tmp395, align 4
-  %tmp397 = getelementptr inbounds float, float addrspace(3)* %arg, i32 227
-  %tmp398 = load float, float addrspace(3)* %tmp397, align 4
+  %tmp393 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 225
+  %tmp394 = load float, ptr addrspace(3) %tmp393, align 4
+  %tmp395 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 226
+  %tmp396 = load float, ptr addrspace(3) %tmp395, align 4
+  %tmp397 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 227
+  %tmp398 = load float, ptr addrspace(3) %tmp397, align 4
   %tmp399 = tail call float @llvm.fmuladd.f32(float %tmp394, float %tmp396, float %tmp398)
-  %tmp400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 229
-  %tmp401 = load float, float addrspace(3)* %tmp400, align 4
-  %tmp402 = getelementptr inbounds float, float addrspace(3)* %arg, i32 230
-  %tmp403 = load float, float addrspace(3)* %tmp402, align 4
-  %tmp404 = getelementptr inbounds float, float addrspace(3)* %arg, i32 231
-  %tmp405 = load float, float addrspace(3)* %tmp404, align 4
+  %tmp400 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 229
+  %tmp401 = load float, ptr addrspace(3) %tmp400, align 4
+  %tmp402 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 230
+  %tmp403 = load float, ptr addrspace(3) %tmp402, align 4
+  %tmp404 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 231
+  %tmp405 = load float, ptr addrspace(3) %tmp404, align 4
   %tmp406 = tail call float @llvm.fmuladd.f32(float %tmp401, float %tmp403, float %tmp405)
-  %tmp407 = getelementptr inbounds float, float addrspace(3)* %arg, i32 233
-  %tmp408 = load float, float addrspace(3)* %tmp407, align 4
-  %tmp409 = getelementptr inbounds float, float addrspace(3)* %arg, i32 234
-  %tmp410 = load float, float addrspace(3)* %tmp409, align 4
-  %tmp411 = getelementptr inbounds float, float addrspace(3)* %arg, i32 235
-  %tmp412 = load float, float addrspace(3)* %tmp411, align 4
+  %tmp407 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 233
+  %tmp408 = load float, ptr addrspace(3) %tmp407, align 4
+  %tmp409 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 234
+  %tmp410 = load float, ptr addrspace(3) %tmp409, align 4
+  %tmp411 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 235
+  %tmp412 = load float, ptr addrspace(3) %tmp411, align 4
   %tmp413 = tail call float @llvm.fmuladd.f32(float %tmp408, float %tmp410, float %tmp412)
-  %tmp414 = getelementptr inbounds float, float addrspace(3)* %arg, i32 237
-  %tmp415 = load float, float addrspace(3)* %tmp414, align 4
-  %tmp416 = getelementptr inbounds float, float addrspace(3)* %arg, i32 238
-  %tmp417 = load float, float addrspace(3)* %tmp416, align 4
-  %tmp418 = getelementptr inbounds float, float addrspace(3)* %arg, i32 239
-  %tmp419 = load float, float addrspace(3)* %tmp418, align 4
+  %tmp414 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 237
+  %tmp415 = load float, ptr addrspace(3) %tmp414, align 4
+  %tmp416 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 238
+  %tmp417 = load float, ptr addrspace(3) %tmp416, align 4
+  %tmp418 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 239
+  %tmp419 = load float, ptr addrspace(3) %tmp418, align 4
   %tmp420 = tail call float @llvm.fmuladd.f32(float %tmp415, float %tmp417, float %tmp419)
-  %tmp421 = getelementptr inbounds float, float addrspace(3)* %arg, i32 241
-  %tmp422 = load float, float addrspace(3)* %tmp421, align 4
-  %tmp423 = getelementptr inbounds float, float addrspace(3)* %arg, i32 242
-  %tmp424 = load float, float addrspace(3)* %tmp423, align 4
-  %tmp425 = getelementptr inbounds float, float addrspace(3)* %arg, i32 243
-  %tmp426 = load float, float addrspace(3)* %tmp425, align 4
+  %tmp421 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 241
+  %tmp422 = load float, ptr addrspace(3) %tmp421, align 4
+  %tmp423 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 242
+  %tmp424 = load float, ptr addrspace(3) %tmp423, align 4
+  %tmp425 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 243
+  %tmp426 = load float, ptr addrspace(3) %tmp425, align 4
   %tmp427 = tail call float @llvm.fmuladd.f32(float %tmp422, float %tmp424, float %tmp426)
-  %tmp428 = getelementptr inbounds float, float addrspace(3)* %arg, i32 245
-  %tmp429 = load float, float addrspace(3)* %tmp428, align 4
-  %tmp430 = getelementptr inbounds float, float addrspace(3)* %arg, i32 246
-  %tmp431 = load float, float addrspace(3)* %tmp430, align 4
-  %tmp432 = getelementptr inbounds float, float addrspace(3)* %arg, i32 247
-  %tmp433 = load float, float addrspace(3)* %tmp432, align 4
+  %tmp428 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 245
+  %tmp429 = load float, ptr addrspace(3) %tmp428, align 4
+  %tmp430 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 246
+  %tmp431 = load float, ptr addrspace(3) %tmp430, align 4
+  %tmp432 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 247
+  %tmp433 = load float, ptr addrspace(3) %tmp432, align 4
   %tmp434 = tail call float @llvm.fmuladd.f32(float %tmp429, float %tmp431, float %tmp433)
-  %tmp435 = getelementptr inbounds float, float addrspace(3)* %arg, i32 249
-  %tmp436 = load float, float addrspace(3)* %tmp435, align 4
-  %tmp437 = getelementptr inbounds float, float addrspace(3)* %arg, i32 250
-  %tmp438 = load float, float addrspace(3)* %tmp437, align 4
-  %tmp439 = getelementptr inbounds float, float addrspace(3)* %arg, i32 251
-  %tmp440 = load float, float addrspace(3)* %tmp439, align 4
+  %tmp435 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 249
+  %tmp436 = load float, ptr addrspace(3) %tmp435, align 4
+  %tmp437 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 250
+  %tmp438 = load float, ptr addrspace(3) %tmp437, align 4
+  %tmp439 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 251
+  %tmp440 = load float, ptr addrspace(3) %tmp439, align 4
   %tmp441 = tail call float @llvm.fmuladd.f32(float %tmp436, float %tmp438, float %tmp440)
-  %tmp442 = getelementptr inbounds float, float addrspace(3)* %arg, i32 253
-  %tmp443 = load float, float addrspace(3)* %tmp442, align 4
-  %tmp444 = getelementptr inbounds float, float addrspace(3)* %arg, i32 254
-  %tmp445 = load float, float addrspace(3)* %tmp444, align 4
-  %tmp446 = getelementptr inbounds float, float addrspace(3)* %arg, i32 255
-  %tmp447 = load float, float addrspace(3)* %tmp446, align 4
+  %tmp442 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 253
+  %tmp443 = load float, ptr addrspace(3) %tmp442, align 4
+  %tmp444 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 254
+  %tmp445 = load float, ptr addrspace(3) %tmp444, align 4
+  %tmp446 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 255
+  %tmp447 = load float, ptr addrspace(3) %tmp446, align 4
   %tmp448 = tail call float @llvm.fmuladd.f32(float %tmp443, float %tmp445, float %tmp447)
-  store float %tmp7, float addrspace(1)* %arg1, align 4
-  %tmp449 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 1
-  store float %tmp14, float addrspace(1)* %tmp449, align 4
-  %tmp450 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 2
-  store float %tmp21, float addrspace(1)* %tmp450, align 4
-  %tmp451 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 3
-  store float %tmp28, float addrspace(1)* %tmp451, align 4
-  %tmp452 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 4
-  store float %tmp35, float addrspace(1)* %tmp452, align 4
-  %tmp453 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 5
-  store float %tmp42, float addrspace(1)* %tmp453, align 4
-  %tmp454 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 6
-  store float %tmp49, float addrspace(1)* %tmp454, align 4
-  %tmp455 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 7
-  store float %tmp56, float addrspace(1)* %tmp455, align 4
-  %tmp456 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 8
-  store float %tmp63, float addrspace(1)* %tmp456, align 4
-  %tmp457 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 9
-  store float %tmp70, float addrspace(1)* %tmp457, align 4
-  %tmp458 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 10
-  store float %tmp77, float addrspace(1)* %tmp458, align 4
-  %tmp459 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 11
-  store float %tmp84, float addrspace(1)* %tmp459, align 4
-  %tmp460 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 12
-  store float %tmp91, float addrspace(1)* %tmp460, align 4
-  %tmp461 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 13
-  store float %tmp98, float addrspace(1)* %tmp461, align 4
-  %tmp462 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 14
-  store float %tmp105, float addrspace(1)* %tmp462, align 4
-  %tmp463 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 15
-  store float %tmp112, float addrspace(1)* %tmp463, align 4
-  %tmp464 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 16
-  store float %tmp119, float addrspace(1)* %tmp464, align 4
-  %tmp465 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 17
-  store float %tmp126, float addrspace(1)* %tmp465, align 4
-  %tmp466 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 18
-  store float %tmp133, float addrspace(1)* %tmp466, align 4
-  %tmp467 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 19
-  store float %tmp140, float addrspace(1)* %tmp467, align 4
-  %tmp468 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 20
-  store float %tmp147, float addrspace(1)* %tmp468, align 4
-  %tmp469 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 21
-  store float %tmp154, float addrspace(1)* %tmp469, align 4
-  %tmp470 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 22
-  store float %tmp161, float addrspace(1)* %tmp470, align 4
-  %tmp471 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 23
-  store float %tmp168, float addrspace(1)* %tmp471, align 4
-  %tmp472 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 24
-  store float %tmp175, float addrspace(1)* %tmp472, align 4
-  %tmp473 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 25
-  store float %tmp182, float addrspace(1)* %tmp473, align 4
-  %tmp474 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 26
-  store float %tmp189, float addrspace(1)* %tmp474, align 4
-  %tmp475 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 27
-  store float %tmp196, float addrspace(1)* %tmp475, align 4
-  %tmp476 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 28
-  store float %tmp203, float addrspace(1)* %tmp476, align 4
-  %tmp477 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 29
-  store float %tmp210, float addrspace(1)* %tmp477, align 4
-  %tmp478 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 30
-  store float %tmp217, float addrspace(1)* %tmp478, align 4
-  %tmp479 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 31
-  store float %tmp224, float addrspace(1)* %tmp479, align 4
-  %tmp480 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 32
-  store float %tmp231, float addrspace(1)* %tmp480, align 4
-  %tmp481 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 33
-  store float %tmp238, float addrspace(1)* %tmp481, align 4
-  %tmp482 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 34
-  store float %tmp245, float addrspace(1)* %tmp482, align 4
-  %tmp483 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 35
-  store float %tmp252, float addrspace(1)* %tmp483, align 4
-  %tmp484 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 36
-  store float %tmp259, float addrspace(1)* %tmp484, align 4
-  %tmp485 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 37
-  store float %tmp266, float addrspace(1)* %tmp485, align 4
-  %tmp486 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 38
-  store float %tmp273, float addrspace(1)* %tmp486, align 4
-  %tmp487 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 39
-  store float %tmp280, float addrspace(1)* %tmp487, align 4
-  %tmp488 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 40
-  store float %tmp287, float addrspace(1)* %tmp488, align 4
-  %tmp489 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 41
-  store float %tmp294, float addrspace(1)* %tmp489, align 4
-  %tmp490 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 42
-  store float %tmp301, float addrspace(1)* %tmp490, align 4
-  %tmp491 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 43
-  store float %tmp308, float addrspace(1)* %tmp491, align 4
-  %tmp492 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 44
-  store float %tmp315, float addrspace(1)* %tmp492, align 4
-  %tmp493 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 45
-  store float %tmp322, float addrspace(1)* %tmp493, align 4
-  %tmp494 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 46
-  store float %tmp329, float addrspace(1)* %tmp494, align 4
-  %tmp495 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 47
-  store float %tmp336, float addrspace(1)* %tmp495, align 4
-  %tmp496 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 48
-  store float %tmp343, float addrspace(1)* %tmp496, align 4
-  %tmp497 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 49
-  store float %tmp350, float addrspace(1)* %tmp497, align 4
-  %tmp498 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 50
-  store float %tmp357, float addrspace(1)* %tmp498, align 4
-  %tmp499 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 51
-  store float %tmp364, float addrspace(1)* %tmp499, align 4
-  %tmp500 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 52
-  store float %tmp371, float addrspace(1)* %tmp500, align 4
-  %tmp501 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 53
-  store float %tmp378, float addrspace(1)* %tmp501, align 4
-  %tmp502 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 54
-  store float %tmp385, float addrspace(1)* %tmp502, align 4
-  %tmp503 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 55
-  store float %tmp392, float addrspace(1)* %tmp503, align 4
-  %tmp504 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 56
-  store float %tmp399, float addrspace(1)* %tmp504, align 4
-  %tmp505 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 57
-  store float %tmp406, float addrspace(1)* %tmp505, align 4
-  %tmp506 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 58
-  store float %tmp413, float addrspace(1)* %tmp506, align 4
-  %tmp507 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 59
-  store float %tmp420, float addrspace(1)* %tmp507, align 4
-  %tmp508 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 60
-  store float %tmp427, float addrspace(1)* %tmp508, align 4
-  %tmp509 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 61
-  store float %tmp434, float addrspace(1)* %tmp509, align 4
-  %tmp510 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 62
-  store float %tmp441, float addrspace(1)* %tmp510, align 4
-  %tmp511 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 63
-  store float %tmp448, float addrspace(1)* %tmp511, align 4
+  store float %tmp7, ptr addrspace(1) %arg1, align 4
+  %tmp449 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 1
+  store float %tmp14, ptr addrspace(1) %tmp449, align 4
+  %tmp450 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 2
+  store float %tmp21, ptr addrspace(1) %tmp450, align 4
+  %tmp451 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 3
+  store float %tmp28, ptr addrspace(1) %tmp451, align 4
+  %tmp452 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 4
+  store float %tmp35, ptr addrspace(1) %tmp452, align 4
+  %tmp453 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 5
+  store float %tmp42, ptr addrspace(1) %tmp453, align 4
+  %tmp454 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 6
+  store float %tmp49, ptr addrspace(1) %tmp454, align 4
+  %tmp455 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 7
+  store float %tmp56, ptr addrspace(1) %tmp455, align 4
+  %tmp456 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 8
+  store float %tmp63, ptr addrspace(1) %tmp456, align 4
+  %tmp457 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 9
+  store float %tmp70, ptr addrspace(1) %tmp457, align 4
+  %tmp458 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 10
+  store float %tmp77, ptr addrspace(1) %tmp458, align 4
+  %tmp459 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 11
+  store float %tmp84, ptr addrspace(1) %tmp459, align 4
+  %tmp460 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 12
+  store float %tmp91, ptr addrspace(1) %tmp460, align 4
+  %tmp461 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 13
+  store float %tmp98, ptr addrspace(1) %tmp461, align 4
+  %tmp462 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 14
+  store float %tmp105, ptr addrspace(1) %tmp462, align 4
+  %tmp463 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 15
+  store float %tmp112, ptr addrspace(1) %tmp463, align 4
+  %tmp464 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 16
+  store float %tmp119, ptr addrspace(1) %tmp464, align 4
+  %tmp465 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 17
+  store float %tmp126, ptr addrspace(1) %tmp465, align 4
+  %tmp466 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 18
+  store float %tmp133, ptr addrspace(1) %tmp466, align 4
+  %tmp467 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 19
+  store float %tmp140, ptr addrspace(1) %tmp467, align 4
+  %tmp468 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 20
+  store float %tmp147, ptr addrspace(1) %tmp468, align 4
+  %tmp469 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 21
+  store float %tmp154, ptr addrspace(1) %tmp469, align 4
+  %tmp470 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 22
+  store float %tmp161, ptr addrspace(1) %tmp470, align 4
+  %tmp471 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 23
+  store float %tmp168, ptr addrspace(1) %tmp471, align 4
+  %tmp472 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 24
+  store float %tmp175, ptr addrspace(1) %tmp472, align 4
+  %tmp473 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 25
+  store float %tmp182, ptr addrspace(1) %tmp473, align 4
+  %tmp474 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 26
+  store float %tmp189, ptr addrspace(1) %tmp474, align 4
+  %tmp475 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 27
+  store float %tmp196, ptr addrspace(1) %tmp475, align 4
+  %tmp476 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 28
+  store float %tmp203, ptr addrspace(1) %tmp476, align 4
+  %tmp477 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 29
+  store float %tmp210, ptr addrspace(1) %tmp477, align 4
+  %tmp478 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 30
+  store float %tmp217, ptr addrspace(1) %tmp478, align 4
+  %tmp479 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 31
+  store float %tmp224, ptr addrspace(1) %tmp479, align 4
+  %tmp480 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 32
+  store float %tmp231, ptr addrspace(1) %tmp480, align 4
+  %tmp481 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 33
+  store float %tmp238, ptr addrspace(1) %tmp481, align 4
+  %tmp482 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 34
+  store float %tmp245, ptr addrspace(1) %tmp482, align 4
+  %tmp483 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 35
+  store float %tmp252, ptr addrspace(1) %tmp483, align 4
+  %tmp484 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 36
+  store float %tmp259, ptr addrspace(1) %tmp484, align 4
+  %tmp485 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 37
+  store float %tmp266, ptr addrspace(1) %tmp485, align 4
+  %tmp486 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 38
+  store float %tmp273, ptr addrspace(1) %tmp486, align 4
+  %tmp487 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 39
+  store float %tmp280, ptr addrspace(1) %tmp487, align 4
+  %tmp488 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 40
+  store float %tmp287, ptr addrspace(1) %tmp488, align 4
+  %tmp489 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 41
+  store float %tmp294, ptr addrspace(1) %tmp489, align 4
+  %tmp490 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 42
+  store float %tmp301, ptr addrspace(1) %tmp490, align 4
+  %tmp491 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 43
+  store float %tmp308, ptr addrspace(1) %tmp491, align 4
+  %tmp492 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 44
+  store float %tmp315, ptr addrspace(1) %tmp492, align 4
+  %tmp493 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 45
+  store float %tmp322, ptr addrspace(1) %tmp493, align 4
+  %tmp494 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 46
+  store float %tmp329, ptr addrspace(1) %tmp494, align 4
+  %tmp495 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 47
+  store float %tmp336, ptr addrspace(1) %tmp495, align 4
+  %tmp496 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 48
+  store float %tmp343, ptr addrspace(1) %tmp496, align 4
+  %tmp497 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 49
+  store float %tmp350, ptr addrspace(1) %tmp497, align 4
+  %tmp498 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 50
+  store float %tmp357, ptr addrspace(1) %tmp498, align 4
+  %tmp499 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 51
+  store float %tmp364, ptr addrspace(1) %tmp499, align 4
+  %tmp500 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 52
+  store float %tmp371, ptr addrspace(1) %tmp500, align 4
+  %tmp501 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 53
+  store float %tmp378, ptr addrspace(1) %tmp501, align 4
+  %tmp502 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 54
+  store float %tmp385, ptr addrspace(1) %tmp502, align 4
+  %tmp503 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 55
+  store float %tmp392, ptr addrspace(1) %tmp503, align 4
+  %tmp504 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 56
+  store float %tmp399, ptr addrspace(1) %tmp504, align 4
+  %tmp505 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 57
+  store float %tmp406, ptr addrspace(1) %tmp505, align 4
+  %tmp506 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 58
+  store float %tmp413, ptr addrspace(1) %tmp506, align 4
+  %tmp507 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 59
+  store float %tmp420, ptr addrspace(1) %tmp507, align 4
+  %tmp508 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 60
+  store float %tmp427, ptr addrspace(1) %tmp508, align 4
+  %tmp509 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 61
+  store float %tmp434, ptr addrspace(1) %tmp509, align 4
+  %tmp510 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 62
+  store float %tmp441, ptr addrspace(1) %tmp510, align 4
+  %tmp511 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 63
+  store float %tmp448, ptr addrspace(1) %tmp511, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll
index 98025b184f204..b78622dead112 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-misched-max-waves.ll
@@ -8,68 +8,68 @@
 ; CHECK-NOT: REG-CRIT
 ; CHECK-NOT: REG-EXCESS
 
-define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(3)* nocapture %arg1) #1 {
+define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %arg, ptr addrspace(3) nocapture %arg1) #1 {
 bb:
-  %tmp0 = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
-  %tmp1 = load float, float addrspace(3)* %tmp0, align 4
-  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
-  %tmp3 = load float, float addrspace(3)* %tmp2, align 4
-  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 3
-  %tmp5 = load float, float addrspace(3)* %tmp4, align 4
-  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4
-  %tmp7 = load float, float addrspace(3)* %tmp6, align 4
-  %tmp8 = getelementptr inbounds float, float addrspace(3)* %arg, i32 5
-  %tmp9 = load float, float addrspace(3)* %tmp8, align 4
-  %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6
-  %tmp11 = load float, float addrspace(3)* %tmp10, align 4
-  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 7
-  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
-  %tmp14 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8
-  %tmp15 = load float, float addrspace(3)* %tmp14, align 4
-  %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 9
-  %tmp17 = load float, float addrspace(3)* %tmp16, align 4
-  %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10
-  %tmp19 = load float, float addrspace(3)* %tmp18, align 4
-  %tmp20 = getelementptr inbounds float, float addrspace(3)* %arg, i32 11
-  %tmp21 = load float, float addrspace(3)* %tmp20, align 4
-  %tmp22 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12
-  %tmp23 = load float, float addrspace(3)* %tmp22, align 4
-  %tmp24 = getelementptr inbounds float, float addrspace(3)* %arg, i32 13
-  %tmp25 = load float, float addrspace(3)* %tmp24, align 4
-  %tmp26 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14
-  %tmp27 = load float, float addrspace(3)* %tmp26, align 4
-  %tmp28 = getelementptr inbounds float, float addrspace(3)* %arg, i32 15
-  %tmp29 = load float, float addrspace(3)* %tmp28, align 4
-  %tmp30 = getelementptr inbounds float, float addrspace(3)* %arg, i32 16
-  %tmp31 = load float, float addrspace(3)* %tmp30, align 4
-  %tmp32 = getelementptr inbounds float, float addrspace(3)* %arg, i32 17
-  %tmp33 = load float, float addrspace(3)* %tmp32, align 4
-  %tmp34 = getelementptr inbounds float, float addrspace(3)* %arg, i32 18
-  %tmp35 = load float, float addrspace(3)* %tmp34, align 4
-  %tmp36 = getelementptr inbounds float, float addrspace(3)* %arg, i32 19
-  %tmp37 = load float, float addrspace(3)* %tmp36, align 4
-  %tmp38 = getelementptr inbounds float, float addrspace(3)* %arg, i32 20
-  %tmp39 = load float, float addrspace(3)* %tmp38, align 4
-  %tmp40 = getelementptr inbounds float, float addrspace(3)* %arg, i32 21
-  %tmp41 = load float, float addrspace(3)* %tmp40, align 4
-  %tmp42 = getelementptr inbounds float, float addrspace(3)* %arg, i32 22
-  %tmp43 = load float, float addrspace(3)* %tmp42, align 4
-  %tmp44 = getelementptr inbounds float, float addrspace(3)* %arg, i32 23
-  %tmp45 = load float, float addrspace(3)* %tmp44, align 4
-  %tmp46 = getelementptr inbounds float, float addrspace(3)* %arg, i32 24
-  %tmp47 = load float, float addrspace(3)* %tmp46, align 4
-  %tmp48 = getelementptr inbounds float, float addrspace(3)* %arg, i32 25
-  %tmp49 = load float, float addrspace(3)* %tmp48, align 4
-  %tmp50 = getelementptr inbounds float, float addrspace(3)* %arg, i32 26
-  %tmp51 = load float, float addrspace(3)* %tmp50, align 4
-  %tmp52 = getelementptr inbounds float, float addrspace(3)* %arg, i32 27
-  %tmp53 = load float, float addrspace(3)* %tmp52, align 4
-  %tmp54 = getelementptr inbounds float, float addrspace(3)* %arg, i32 28
-  %tmp55 = load float, float addrspace(3)* %tmp54, align 4
-  %tmp56 = getelementptr inbounds float, float addrspace(3)* %arg, i32 29
-  %tmp57 = load float, float addrspace(3)* %tmp56, align 4
-  %tmp58 = getelementptr inbounds float, float addrspace(3)* %arg, i32 30
-  %tmp59 = load float, float addrspace(3)* %tmp58, align 4
+  %tmp0 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 1
+  %tmp1 = load float, ptr addrspace(3) %tmp0, align 4
+  %tmp2 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 2
+  %tmp3 = load float, ptr addrspace(3) %tmp2, align 4
+  %tmp4 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 3
+  %tmp5 = load float, ptr addrspace(3) %tmp4, align 4
+  %tmp6 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 4
+  %tmp7 = load float, ptr addrspace(3) %tmp6, align 4
+  %tmp8 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 5
+  %tmp9 = load float, ptr addrspace(3) %tmp8, align 4
+  %tmp10 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 6
+  %tmp11 = load float, ptr addrspace(3) %tmp10, align 4
+  %tmp12 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 7
+  %tmp13 = load float, ptr addrspace(3) %tmp12, align 4
+  %tmp14 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 8
+  %tmp15 = load float, ptr addrspace(3) %tmp14, align 4
+  %tmp16 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 9
+  %tmp17 = load float, ptr addrspace(3) %tmp16, align 4
+  %tmp18 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 10
+  %tmp19 = load float, ptr addrspace(3) %tmp18, align 4
+  %tmp20 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 11
+  %tmp21 = load float, ptr addrspace(3) %tmp20, align 4
+  %tmp22 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 12
+  %tmp23 = load float, ptr addrspace(3) %tmp22, align 4
+  %tmp24 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 13
+  %tmp25 = load float, ptr addrspace(3) %tmp24, align 4
+  %tmp26 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 14
+  %tmp27 = load float, ptr addrspace(3) %tmp26, align 4
+  %tmp28 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 15
+  %tmp29 = load float, ptr addrspace(3) %tmp28, align 4
+  %tmp30 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 16
+  %tmp31 = load float, ptr addrspace(3) %tmp30, align 4
+  %tmp32 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 17
+  %tmp33 = load float, ptr addrspace(3) %tmp32, align 4
+  %tmp34 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 18
+  %tmp35 = load float, ptr addrspace(3) %tmp34, align 4
+  %tmp36 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 19
+  %tmp37 = load float, ptr addrspace(3) %tmp36, align 4
+  %tmp38 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 20
+  %tmp39 = load float, ptr addrspace(3) %tmp38, align 4
+  %tmp40 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 21
+  %tmp41 = load float, ptr addrspace(3) %tmp40, align 4
+  %tmp42 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 22
+  %tmp43 = load float, ptr addrspace(3) %tmp42, align 4
+  %tmp44 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 23
+  %tmp45 = load float, ptr addrspace(3) %tmp44, align 4
+  %tmp46 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 24
+  %tmp47 = load float, ptr addrspace(3) %tmp46, align 4
+  %tmp48 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 25
+  %tmp49 = load float, ptr addrspace(3) %tmp48, align 4
+  %tmp50 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 26
+  %tmp51 = load float, ptr addrspace(3) %tmp50, align 4
+  %tmp52 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 27
+  %tmp53 = load float, ptr addrspace(3) %tmp52, align 4
+  %tmp54 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 28
+  %tmp55 = load float, ptr addrspace(3) %tmp54, align 4
+  %tmp56 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 29
+  %tmp57 = load float, ptr addrspace(3) %tmp56, align 4
+  %tmp58 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 30
+  %tmp59 = load float, ptr addrspace(3) %tmp58, align 4
   %tmp60 = tail call float @llvm.fmuladd.f32(float %tmp1, float %tmp3, float %tmp5)
   %tmp61 = tail call float @llvm.fmuladd.f32(float %tmp7, float %tmp9, float %tmp11)
   %tmp62 = tail call float @llvm.fmuladd.f32(float %tmp13, float %tmp15, float %tmp17)
@@ -80,26 +80,26 @@ bb:
   %tmp67 = tail call float @llvm.fmuladd.f32(float %tmp43, float %tmp45, float %tmp47)
   %tmp68 = tail call float @llvm.fmuladd.f32(float %tmp49, float %tmp51, float %tmp53)
   %tmp69 = tail call float @llvm.fmuladd.f32(float %tmp55, float %tmp57, float %tmp59)
-  %tmp70 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 1
-  store float %tmp60, float addrspace(3)* %tmp70, align 4
-  %tmp71 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 2
-  store float %tmp61, float addrspace(3)* %tmp71, align 4
-  %tmp72 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 3
-  store float %tmp62, float addrspace(3)* %tmp72, align 4
-  %tmp73 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 4
-  store float %tmp63, float addrspace(3)* %tmp73, align 4
-  %tmp74 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 5
-  store float %tmp64, float addrspace(3)* %tmp74, align 4
-  %tmp75 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 6
-  store float %tmp65, float addrspace(3)* %tmp75, align 4
-  %tmp76 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 7
-  store float %tmp66, float addrspace(3)* %tmp76, align 4
-  %tmp77 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 8
-  store float %tmp67, float addrspace(3)* %tmp77, align 4
-  %tmp78 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 9
-  store float %tmp68, float addrspace(3)* %tmp78, align 4
-  %tmp79 = getelementptr inbounds float, float addrspace(3)* %arg1, i64 10
-  store float %tmp69, float addrspace(3)* %tmp79, align 4
+  %tmp70 = getelementptr inbounds float, ptr addrspace(3) %arg1, i64 1
+  store float %tmp60, ptr addrspace(3) %tmp70, align 4
+  %tmp71 = getelementptr inbounds float, ptr addrspace(3) %arg1, i64 2
+  store float %tmp61, ptr addrspace(3) %tmp71, align 4
+  %tmp72 = getelementptr inbounds float, ptr addrspace(3) %arg1, i64 3
+  store float %tmp62, ptr addrspace(3) %tmp72, align 4
+  %tmp73 = getelementptr inbounds float, ptr addrspace(3) %arg1, i64 4
+  store float %tmp63, ptr addrspace(3) %tmp73, align 4
+  %tmp74 = getelementptr inbounds float, ptr addrspace(3) %arg1, i64 5
+  store float %tmp64, ptr addrspace(3) %tmp74, align 4
+  %tmp75 = getelementptr inbounds float, ptr addrspace(3) %arg1, i64 6
+  store float %tmp65, ptr addrspace(3) %tmp75, align 4
+  %tmp76 = getelementptr inbounds float, ptr addrspace(3) %arg1, i64 7
+  store float %tmp66, ptr addrspace(3) %tmp76, align 4
+  %tmp77 = getelementptr inbounds float, ptr addrspace(3) %arg1, i64 8
+  store float %tmp67, ptr addrspace(3) %tmp77, align 4
+  %tmp78 = getelementptr inbounds float, ptr addrspace(3) %arg1, i64 9
+  store float %tmp68, ptr addrspace(3) %tmp78, align 4
+  %tmp79 = getelementptr inbounds float, ptr addrspace(3) %arg1, i64 10
+  store float %tmp69, ptr addrspace(3) %tmp79, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
index 96ebb6f83628d..538e2984c0736 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
@@ -36,63 +36,63 @@ ENDIF:                                            ; preds = %main_body, %Flow2
   %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ]
   %15 = extractelement <4 x float> %reg1, i32 1
   %16 = extractelement <4 x float> %reg1, i32 3
-  %17 = load <4 x float>, <4 x float> addrspace(4)* null
+  %17 = load <4 x float>, ptr addrspace(4) null
   %18 = extractelement <4 x float> %17, i32 0
   %19 = fmul float %18, %0
-  %20 = load <4 x float>, <4 x float> addrspace(4)* null
+  %20 = load <4 x float>, ptr addrspace(4) null
   %21 = extractelement <4 x float> %20, i32 1
   %22 = fmul float %21, %0
-  %23 = load <4 x float>, <4 x float> addrspace(4)* null
+  %23 = load <4 x float>, ptr addrspace(4) null
   %24 = extractelement <4 x float> %23, i32 2
   %25 = fmul float %24, %0
-  %26 = load <4 x float>, <4 x float> addrspace(4)* null
+  %26 = load <4 x float>, ptr addrspace(4) null
   %27 = extractelement <4 x float> %26, i32 3
   %28 = fmul float %27, %0
-  %29 = load <4 x float>, <4 x float> addrspace(4)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(4)* null, i64 0, i32 1)
+  %29 = load <4 x float>, ptr addrspace(4) getelementptr ([1024 x <4 x float>], ptr addrspace(4) null, i64 0, i32 1)
   %30 = extractelement <4 x float> %29, i32 0
   %31 = fmul float %30, %15
   %32 = fadd float %31, %19
-  %33 = load <4 x float>, <4 x float> addrspace(4)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(4)* null, i64 0, i32 1)
+  %33 = load <4 x float>, ptr addrspace(4) getelementptr ([1024 x <4 x float>], ptr addrspace(4) null, i64 0, i32 1)
   %34 = extractelement <4 x float> %33, i32 1
   %35 = fmul float %34, %15
   %36 = fadd float %35, %22
-  %37 = load <4 x float>, <4 x float> addrspace(4)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(4)* null, i64 0, i32 1)
+  %37 = load <4 x float>, ptr addrspace(4) getelementptr ([1024 x <4 x float>], ptr addrspace(4) null, i64 0, i32 1)
   %38 = extractelement <4 x float> %37, i32 2
   %39 = fmul float %38, %15
   %40 = fadd float %39, %25
-  %41 = load <4 x float>, <4 x float> addrspace(4)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(4)* null, i64 0, i32 1)
+  %41 = load <4 x float>, ptr addrspace(4) getelementptr ([1024 x <4 x float>], ptr addrspace(4) null, i64 0, i32 1)
   %42 = extractelement <4 x float> %41, i32 3
   %43 = fmul float %42, %15
   %44 = fadd float %43, %28
-  %45 = load <4 x float>, <4 x float> addrspace(4)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(4)* null, i64 0, i32 2)
+  %45 = load <4 x float>, ptr addrspace(4) getelementptr ([1024 x <4 x float>], ptr addrspace(4) null, i64 0, i32 2)
   %46 = extractelement <4 x float> %45, i32 0
   %47 = fmul float %46, %1
   %48 = fadd float %47, %32
-  %49 = load <4 x float>, <4 x float> addrspace(4)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(4)* null, i64 0, i32 2)
+  %49 = load <4 x float>, ptr addrspace(4) getelementptr ([1024 x <4 x float>], ptr addrspace(4) null, i64 0, i32 2)
   %50 = extractelement <4 x float> %49, i32 1
   %51 = fmul float %50, %1
   %52 = fadd float %51, %36
-  %53 = load <4 x float>, <4 x float> addrspace(4)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(4)* null, i64 0, i32 2)
+  %53 = load <4 x float>, ptr addrspace(4) getelementptr ([1024 x <4 x float>], ptr addrspace(4) null, i64 0, i32 2)
   %54 = extractelement <4 x float> %53, i32 2
   %55 = fmul float %54, %1
   %56 = fadd float %55, %40
-  %57 = load <4 x float>, <4 x float> addrspace(4)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(4)* null, i64 0, i32 2)
+  %57 = load <4 x float>, ptr addrspace(4) getelementptr ([1024 x <4 x float>], ptr addrspace(4) null, i64 0, i32 2)
   %58 = extractelement <4 x float> %57, i32 3
   %59 = fmul float %58, %1
   %60 = fadd float %59, %44
-  %61 = load <4 x float>, <4 x float> addrspace(4)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(4)* null, i64 0, i32 3)
+  %61 = load <4 x float>, ptr addrspace(4) getelementptr ([1024 x <4 x float>], ptr addrspace(4) null, i64 0, i32 3)
   %62 = extractelement <4 x float> %61, i32 0
   %63 = fmul float %62, %16
   %64 = fadd float %63, %48
-  %65 = load <4 x float>, <4 x float> addrspace(4)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(4)* null, i64 0, i32 3)
+  %65 = load <4 x float>, ptr addrspace(4) getelementptr ([1024 x <4 x float>], ptr addrspace(4) null, i64 0, i32 3)
   %66 = extractelement <4 x float> %65, i32 1
   %67 = fmul float %66, %16
   %68 = fadd float %67, %52
-  %69 = load <4 x float>, <4 x float> addrspace(4)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(4)* null, i64 0, i32 3)
+  %69 = load <4 x float>, ptr addrspace(4) getelementptr ([1024 x <4 x float>], ptr addrspace(4) null, i64 0, i32 3)
   %70 = extractelement <4 x float> %69, i32 2
   %71 = fmul float %70, %16
   %72 = fadd float %71, %56
-  %73 = load <4 x float>, <4 x float> addrspace(4)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(4)* null, i64 0, i32 3)
+  %73 = load <4 x float>, ptr addrspace(4) getelementptr ([1024 x <4 x float>], ptr addrspace(4) null, i64 0, i32 3)
   %74 = extractelement <4 x float> %73, i32 3
   %75 = fmul float %74, %16
   %76 = fadd float %75, %60

diff --git a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
index 00d4ba66913db..1ba551ac2b7f4 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
@@ -21,63 +21,63 @@ ENDIF:                                            ; preds = %ENDIF16, %LOOP, %ma
   %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %temp1.1, %LOOP ], [ %temp1.1, %ENDIF16 ]
   %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %temp2.1, %LOOP ], [ %temp2.1, %ENDIF16 ]
   %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %temp3.1, %LOOP ], [ %temp3.1, %ENDIF16 ]
-  %11 = load <4 x float>, <4 x float> addrspace(9)* null
+  %11 = load <4 x float>, ptr addrspace(9) null
   %12 = extractelement <4 x float> %11, i32 0
   %13 = fmul float %12, %0
-  %14 = load <4 x float>, <4 x float> addrspace(9)* null
+  %14 = load <4 x float>, ptr addrspace(9) null
   %15 = extractelement <4 x float> %14, i32 1
   %16 = fmul float %15, %0
-  %17 = load <4 x float>, <4 x float> addrspace(9)* null
+  %17 = load <4 x float>, ptr addrspace(9) null
   %18 = extractelement <4 x float> %17, i32 2
   %19 = fmul float %18, %0
-  %20 = load <4 x float>, <4 x float> addrspace(9)* null
+  %20 = load <4 x float>, ptr addrspace(9) null
   %21 = extractelement <4 x float> %20, i32 3
   %22 = fmul float %21, %0
-  %23 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %23 = load <4 x float>, ptr addrspace(9) getelementptr ([1024 x <4 x float>], ptr addrspace(9) null, i64 0, i32 1)
   %24 = extractelement <4 x float> %23, i32 0
   %25 = fmul float %24, %1
   %26 = fadd float %25, %13
-  %27 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %27 = load <4 x float>, ptr addrspace(9) getelementptr ([1024 x <4 x float>], ptr addrspace(9) null, i64 0, i32 1)
   %28 = extractelement <4 x float> %27, i32 1
   %29 = fmul float %28, %1
   %30 = fadd float %29, %16
-  %31 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %31 = load <4 x float>, ptr addrspace(9) getelementptr ([1024 x <4 x float>], ptr addrspace(9) null, i64 0, i32 1)
   %32 = extractelement <4 x float> %31, i32 2
   %33 = fmul float %32, %1
   %34 = fadd float %33, %19
-  %35 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %35 = load <4 x float>, ptr addrspace(9) getelementptr ([1024 x <4 x float>], ptr addrspace(9) null, i64 0, i32 1)
   %36 = extractelement <4 x float> %35, i32 3
   %37 = fmul float %36, %1
   %38 = fadd float %37, %22
-  %39 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %39 = load <4 x float>, ptr addrspace(9) getelementptr ([1024 x <4 x float>], ptr addrspace(9) null, i64 0, i32 2)
   %40 = extractelement <4 x float> %39, i32 0
   %41 = fmul float %40, %2
   %42 = fadd float %41, %26
-  %43 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %43 = load <4 x float>, ptr addrspace(9) getelementptr ([1024 x <4 x float>], ptr addrspace(9) null, i64 0, i32 2)
   %44 = extractelement <4 x float> %43, i32 1
   %45 = fmul float %44, %2
   %46 = fadd float %45, %30
-  %47 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %47 = load <4 x float>, ptr addrspace(9) getelementptr ([1024 x <4 x float>], ptr addrspace(9) null, i64 0, i32 2)
   %48 = extractelement <4 x float> %47, i32 2
   %49 = fmul float %48, %2
   %50 = fadd float %49, %34
-  %51 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %51 = load <4 x float>, ptr addrspace(9) getelementptr ([1024 x <4 x float>], ptr addrspace(9) null, i64 0, i32 2)
   %52 = extractelement <4 x float> %51, i32 3
   %53 = fmul float %52, %2
   %54 = fadd float %53, %38
-  %55 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %55 = load <4 x float>, ptr addrspace(9) getelementptr ([1024 x <4 x float>], ptr addrspace(9) null, i64 0, i32 3)
   %56 = extractelement <4 x float> %55, i32 0
   %57 = fmul float %56, %3
   %58 = fadd float %57, %42
-  %59 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %59 = load <4 x float>, ptr addrspace(9) getelementptr ([1024 x <4 x float>], ptr addrspace(9) null, i64 0, i32 3)
   %60 = extractelement <4 x float> %59, i32 1
   %61 = fmul float %60, %3
   %62 = fadd float %61, %46
-  %63 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %63 = load <4 x float>, ptr addrspace(9) getelementptr ([1024 x <4 x float>], ptr addrspace(9) null, i64 0, i32 3)
   %64 = extractelement <4 x float> %63, i32 2
   %65 = fmul float %64, %3
   %66 = fadd float %65, %50
-  %67 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %67 = load <4 x float>, ptr addrspace(9) getelementptr ([1024 x <4 x float>], ptr addrspace(9) null, i64 0, i32 3)
   %68 = extractelement <4 x float> %67, i32 3
   %69 = fmul float %68, %3
   %70 = fadd float %69, %54

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll b/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll
index 6beddf8fe947a..0f42139c66824 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll
@@ -6,37 +6,37 @@ declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <3
 ; CHECK: CritRes: {{[0-9]+}} HWXDL
 ; CHECK: Picking: Cand SU([[nid:[0-9]+]]) RES-DEMAND
 ; CHECK: Scheduling SU([[nid]]) {{.*}} V_MFMA_F32_32X32X4F16
-define amdgpu_kernel void @schedule-xdl-resource(<32 x float> addrspace(1)* %in, <32 x float> addrspace(1)* %out, <4 x half> addrspace(3)* %lds, i32 %stride) #0 {
-  %in_ptr.1 = getelementptr <32 x float>, <32 x float> addrspace(1)* %in, i32 %stride
-  %in_ptr.2 = getelementptr <32 x float>, <32 x float> addrspace(1)* %in_ptr.1, i32 %stride
-  %in_ptr.3 = getelementptr <32 x float>, <32 x float> addrspace(1)* %in_ptr.2, i32 %stride
-  %in.load.1 = load <32 x float>, <32 x float> addrspace (1)* %in_ptr.1
-  %in.load.2 = load <32 x float>, <32 x float> addrspace (1)* %in_ptr.2
-  %in.load.3 = load <32 x float>, <32 x float> addrspace (1)* %in_ptr.3
-  %lds_ptr.1 = getelementptr <4 x half>, <4 x half> addrspace(3)* %lds, i32 %stride
-  %lds_ptr.2 = getelementptr <4 x half>, <4 x half> addrspace(3)* %lds_ptr.1, i32 %stride
-  %lds_ptr.3 = getelementptr <4 x half>, <4 x half> addrspace(3)* %lds_ptr.2, i32 %stride
-  %lds.load.1 = load <4 x half>, <4 x half> addrspace(3)* %lds_ptr.1
-  %lds.load.2 = load <4 x half>, <4 x half> addrspace(3)* %lds_ptr.2
-  %lds.load.3 = load <4 x half>, <4 x half> addrspace(3)* %lds_ptr.3
+define amdgpu_kernel void @schedule-xdl-resource(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(3) %lds, i32 %stride) #0 {
+  %in_ptr.1 = getelementptr <32 x float>, ptr addrspace(1) %in, i32 %stride
+  %in_ptr.2 = getelementptr <32 x float>, ptr addrspace(1) %in_ptr.1, i32 %stride
+  %in_ptr.3 = getelementptr <32 x float>, ptr addrspace(1) %in_ptr.2, i32 %stride
+  %in.load.1 = load <32 x float>, ptr addrspace (1) %in_ptr.1
+  %in.load.2 = load <32 x float>, ptr addrspace (1) %in_ptr.2
+  %in.load.3 = load <32 x float>, ptr addrspace (1) %in_ptr.3
+  %lds_ptr.1 = getelementptr <4 x half>, ptr addrspace(3) %lds, i32 %stride
+  %lds_ptr.2 = getelementptr <4 x half>, ptr addrspace(3) %lds_ptr.1, i32 %stride
+  %lds_ptr.3 = getelementptr <4 x half>, ptr addrspace(3) %lds_ptr.2, i32 %stride
+  %lds.load.1 = load <4 x half>, ptr addrspace(3) %lds_ptr.1
+  %lds.load.2 = load <4 x half>, ptr addrspace(3) %lds_ptr.2
+  %lds.load.3 = load <4 x half>, ptr addrspace(3) %lds_ptr.3
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.load.1, <4 x half> %lds.load.1, <32 x float> %in.load.1, i32 1, i32 1, i32 1)
   %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.load.2, <4 x half> %lds.load.2, <32 x float> %in.load.2, i32 1, i32 1, i32 1)
   %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.load.3, <4 x half> %lds.load.3, <32 x float> %in.load.3, i32 1, i32 1, i32 1)
   %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.load.1, <4 x half> %lds.load.1, <32 x float> %in.load.1, i32 2, i32 2, i32 2)
   %mai.5 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.load.2, <4 x half> %lds.load.2, <32 x float> %in.load.2, i32 2, i32 2, i32 2)
   %mai.6 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %lds.load.3, <4 x half> %lds.load.3, <32 x float> %in.load.3, i32 2, i32 2, i32 2)
-  %out_ptr.1 = getelementptr <32 x float>, <32 x float> addrspace(1)* %out, i32 %stride
-  %out_ptr.2 = getelementptr <32 x float>, <32 x float> addrspace(1)* %out_ptr.1, i32 %stride
-  %out_ptr.3 = getelementptr <32 x float>, <32 x float> addrspace(1)* %out_ptr.2, i32 %stride
-  %out_ptr.4 = getelementptr <32 x float>, <32 x float> addrspace(1)* %out_ptr.3, i32 %stride
-  %out_ptr.5 = getelementptr <32 x float>, <32 x float> addrspace(1)* %out_ptr.4, i32 %stride
-  %out_ptr.6 = getelementptr <32 x float>, <32 x float> addrspace(1)* %out_ptr.5, i32 %stride
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %out_ptr.1
-  store <32 x float> %mai.2, <32 x float> addrspace(1)* %out_ptr.2
-  store <32 x float> %mai.3, <32 x float> addrspace(1)* %out_ptr.3
-  store <32 x float> %mai.4, <32 x float> addrspace(1)* %out_ptr.4
-  store <32 x float> %mai.5, <32 x float> addrspace(1)* %out_ptr.5
-  store <32 x float> %mai.6, <32 x float> addrspace(1)* %out_ptr.6
+  %out_ptr.1 = getelementptr <32 x float>, ptr addrspace(1) %out, i32 %stride
+  %out_ptr.2 = getelementptr <32 x float>, ptr addrspace(1) %out_ptr.1, i32 %stride
+  %out_ptr.3 = getelementptr <32 x float>, ptr addrspace(1) %out_ptr.2, i32 %stride
+  %out_ptr.4 = getelementptr <32 x float>, ptr addrspace(1) %out_ptr.3, i32 %stride
+  %out_ptr.5 = getelementptr <32 x float>, ptr addrspace(1) %out_ptr.4, i32 %stride
+  %out_ptr.6 = getelementptr <32 x float>, ptr addrspace(1) %out_ptr.5, i32 %stride
+  store <32 x float> %mai.1, ptr addrspace(1) %out_ptr.1
+  store <32 x float> %mai.2, ptr addrspace(1) %out_ptr.2
+  store <32 x float> %mai.3, ptr addrspace(1) %out_ptr.3
+  store <32 x float> %mai.4, ptr addrspace(1) %out_ptr.4
+  store <32 x float> %mai.5, ptr addrspace(1) %out_ptr.5
+  store <32 x float> %mai.6, ptr addrspace(1) %out_ptr.6
 
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
index 4a44a89a016f7..9e6d9b2ea04bd 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -13,33 +13,31 @@
 ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8004
 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}}
 
-define amdgpu_kernel void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
+define amdgpu_kernel void @legal_offset_fi(ptr addrspace(1) %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
 entry:
   %scratch0 = alloca [8192 x i32], addrspace(5)
   %scratch1 = alloca [8192 x i32], addrspace(5)
 
-  %scratchptr0 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 0
-  store i32 1, i32 addrspace(5)* %scratchptr0
+  store i32 1, ptr addrspace(5) %scratch0
 
-  %scratchptr1 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 0
-  store i32 2, i32 addrspace(5)* %scratchptr1
+  store i32 2, ptr addrspace(5) %scratch1
 
   %cmp = icmp eq i32 %cond, 0
   br i1 %cmp, label %if, label %else
 
 if:
-  %if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset
-  %if_value = load i32, i32 addrspace(5)* %if_ptr
+  %if_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch0, i32 0, i32 %if_offset
+  %if_value = load i32, ptr addrspace(5) %if_ptr
   br label %done
 
 else:
-  %else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset
-  %else_value = load i32, i32 addrspace(5)* %else_ptr
+  %else_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch1, i32 0, i32 %else_offset
+  %else_value = load i32, ptr addrspace(5) %else_ptr
   br label %done
 
 done:
   %value = phi i32 [%if_value, %if], [%else_value, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 
   ret void
@@ -52,36 +50,36 @@ done:
 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, 0x8004
 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}}
 
-define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {
+define amdgpu_kernel void @legal_offset_fi_offset(ptr addrspace(1) %out, i32 %cond, ptr addrspace(1) %offsets, i32 %if_offset, i32 %else_offset) {
 entry:
   %scratch0 = alloca [8192 x i32], addrspace(5)
   %scratch1 = alloca [8192 x i32], addrspace(5)
 
-  %offset0 = load i32, i32 addrspace(1)* %offsets
-  %scratchptr0 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %offset0
-  store i32 %offset0, i32 addrspace(5)* %scratchptr0
+  %offset0 = load i32, ptr addrspace(1) %offsets
+  %scratchptr0 = getelementptr [8192 x i32], ptr addrspace(5) %scratch0, i32 0, i32 %offset0
+  store i32 %offset0, ptr addrspace(5) %scratchptr0
 
-  %offsetptr1 = getelementptr i32, i32 addrspace(1)* %offsets, i32 1
-  %offset1 = load i32, i32 addrspace(1)* %offsetptr1
-  %scratchptr1 = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %offset1
-  store i32 %offset1, i32 addrspace(5)* %scratchptr1
+  %offsetptr1 = getelementptr i32, ptr addrspace(1) %offsets, i32 1
+  %offset1 = load i32, ptr addrspace(1) %offsetptr1
+  %scratchptr1 = getelementptr [8192 x i32], ptr addrspace(5) %scratch1, i32 0, i32 %offset1
+  store i32 %offset1, ptr addrspace(5) %scratchptr1
 
   %cmp = icmp eq i32 %cond, 0
   br i1 %cmp, label %if, label %else
 
 if:
-  %if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset
-  %if_value = load i32, i32 addrspace(5)* %if_ptr
+  %if_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch0, i32 0, i32 %if_offset
+  %if_value = load i32, ptr addrspace(5) %if_ptr
   br label %done
 
 else:
-  %else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset
-  %else_value = load i32, i32 addrspace(5)* %else_ptr
+  %else_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch1, i32 0, i32 %else_offset
+  %else_value = load i32, ptr addrspace(5) %else_ptr
   br label %done
 
 done:
   %value = phi i32 [%if_value, %if], [%else_value, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -92,8 +90,8 @@ define amdgpu_kernel void @neg_vaddr_offset_inbounds(i32 %offset) {
 entry:
   %array = alloca [8192 x i32], addrspace(5)
   %ptr_offset = add i32 %offset, 4
-  %ptr = getelementptr inbounds [8192 x i32], [8192 x i32] addrspace(5)* %array, i32 0, i32 %ptr_offset
-  store i32 0, i32 addrspace(5)* %ptr
+  %ptr = getelementptr inbounds [8192 x i32], ptr addrspace(5) %array, i32 0, i32 %ptr_offset
+  store i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
@@ -104,20 +102,20 @@ define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) {
 entry:
   %array = alloca [8192 x i32], addrspace(5)
   %ptr_offset = add i32 %offset, 4
-  %ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %array, i32 0, i32 %ptr_offset
-  store i32 0, i32 addrspace(5)* %ptr
+  %ptr = getelementptr [8192 x i32], ptr addrspace(5) %array, i32 0, i32 %ptr_offset
+  store i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 ; GCN-LABEL: {{^}}pos_vaddr_offset:
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:20
-define amdgpu_kernel void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
+define amdgpu_kernel void @pos_vaddr_offset(ptr addrspace(1) %out, i32 %offset) {
 entry:
   %array = alloca [8192 x i32], addrspace(5)
-  %ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %array, i32 0, i32 4
-  store i32 0, i32 addrspace(5)* %ptr
-  %load_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %array, i32 0, i32 %offset
-  %val = load i32, i32 addrspace(5)* %load_ptr
-  store i32 %val, i32 addrspace(1)* %out
+  %ptr = getelementptr [8192 x i32], ptr addrspace(5) %array, i32 0, i32 4
+  store i32 0, ptr addrspace(5) %ptr
+  %load_ptr = getelementptr [8192 x i32], ptr addrspace(5) %array, i32 0, i32 %offset
+  %val = load i32, ptr addrspace(5) %load_ptr
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 465acccd3d4d9..38bf5f9fc7ff4 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -13,7 +13,7 @@
 ; This was fixed by adding an additional pattern in R600Instructions.td to
 ; match this pattern with a CNDGE_INT.
 
-define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: sdiv_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -193,15 +193,15 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; EG-NEXT:     SUB_INT T0.X, PV.W, T0.W,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in
-  %den = load i32, i32 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in
+  %den = load i32, ptr addrspace(1) %den_ptr
   %result = sdiv i32 %num, %den
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: sdiv_i32_4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -286,16 +286,16 @@ define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %num = load i32, i32 addrspace(1) * %in
+  %num = load i32, ptr addrspace(1) %in
   %result = sdiv i32 %num, 4
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; Multiply by a weird constant to make sure setIntDivIsCheap is
 ; working.
 
-define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: slow_sdiv_i32_3435:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -387,13 +387,13 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrsp
 ; EG-NEXT:     ADD_INT T0.X, PV.W, PS,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %num = load i32, i32 addrspace(1) * %in
+  %num = load i32, ptr addrspace(1) %in
   %result = sdiv i32 %num, 3435
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: sdiv_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -686,15 +686,15 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; EG-NEXT:     SUB_INT T0.X, PV.W, T0.W,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
-  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
-  %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
+  %den_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
+  %num = load <2 x i32>, ptr addrspace(1) %in
+  %den = load <2 x i32>, ptr addrspace(1) %den_ptr
   %result = sdiv <2 x i32> %num, %den
-  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: sdiv_v2i32_4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -797,13 +797,13 @@ define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32>
 ; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %num = load <2 x i32>, ptr addrspace(1) %in
   %result = sdiv <2 x i32> %num, <i32 4, i32 4>
-  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: sdiv_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1329,15 +1329,15 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; EG-NEXT:     SUB_INT T2.X, PV.W, T0.W,
 ; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
-  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
-  %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
+  %den_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
+  %num = load <4 x i32>, ptr addrspace(1) %in
+  %den = load <4 x i32>, ptr addrspace(1) %den_ptr
   %result = sdiv <4 x i32> %num, %den
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: sdiv_v4i32_4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1475,13 +1475,13 @@ define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32>
 ; EG-NEXT:     ASHR T1.X, PV.W, literal.x,
 ; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %num = load <4 x i32>, ptr addrspace(1) %in
   %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: v_sdiv_i8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1617,16 +1617,16 @@ define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %i
 ; EG-NEXT:     BFE_INT T0.X, PV.W, 0.0, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
-  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %num = load i8, i8 addrspace(1) * %in
-  %den = load i8, i8 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %num = load i8, ptr addrspace(1) %in
+  %den = load i8, ptr addrspace(1) %den_ptr
   %result = sdiv i8 %num, %den
   %result.ext = sext i8 %result to i32
-  store i32 %result.ext, i32 addrspace(1)* %out
+  store i32 %result.ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: v_sdiv_i23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1806,16 +1806,16 @@ define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)*
 ; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    9(1.261169e-44), 2(2.802597e-45)
-  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
-  %num = load i23, i23 addrspace(1) * %in
-  %den = load i23, i23 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i23, ptr addrspace(1) %in, i23 1
+  %num = load i23, ptr addrspace(1) %in
+  %den = load i23, ptr addrspace(1) %den_ptr
   %result = sdiv i23 %num, %den
   %result.ext = sext i23 %result to i32
-  store i32 %result.ext, i32 addrspace(1)* %out
+  store i32 %result.ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: v_sdiv_i24:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1985,16 +1985,16 @@ define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)*
 ; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
-  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
-  %num = load i24, i24 addrspace(1) * %in
-  %den = load i24, i24 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i24, ptr addrspace(1) %in, i24 1
+  %num = load i24, ptr addrspace(1) %in
+  %den = load i24, ptr addrspace(1) %den_ptr
   %result = sdiv i24 %num, %den
   %result.ext = sext i24 %result to i32
-  store i32 %result.ext, i32 addrspace(1)* %out
+  store i32 %result.ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: v_sdiv_i25:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2196,37 +2196,37 @@ define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)*
 ; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    7(9.809089e-45), 2(2.802597e-45)
-  %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
-  %num = load i25, i25 addrspace(1) * %in
-  %den = load i25, i25 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i25, ptr addrspace(1) %in, i25 1
+  %num = load i25, ptr addrspace(1) %in
+  %den = load i25, ptr addrspace(1) %den_ptr
   %result = sdiv i25 %num, %den
   %result.ext = sext i25 %result to i32
-  store i32 %result.ext, i32 addrspace(1)* %out
+  store i32 %result.ext, ptr addrspace(1) %out
   ret void
 }
 
 ; Tests for 64-bit divide bypass.
-; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; define amdgpu_kernel void @test_get_quotient(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
 ;   %result = sdiv i64 %a, %b
-;   store i64 %result, i64 addrspace(1)* %out, align 8
+;   store i64 %result, ptr addrspace(1) %out, align 8
 ;   ret void
 ; }
 
-; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; define amdgpu_kernel void @test_get_remainder(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
 ;   %result = srem i64 %a, %b
-;   store i64 %result, i64 addrspace(1)* %out, align 8
+;   store i64 %result, ptr addrspace(1) %out, align 8
 ;   ret void
 ; }
 
-; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; define amdgpu_kernel void @test_get_quotient_and_remainder(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
 ;   %resultdiv = sdiv i64 %a, %b
 ;   %resultrem = srem i64 %a, %b
 ;   %result = add i64 %resultdiv, %resultrem
-;   store i64 %result, i64 addrspace(1)* %out, align 8
+;   store i64 %result, ptr addrspace(1) %out, align 8
 ;   ret void
 ; }
 
-define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
+define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) {
 ; GCN-LABEL: scalarize_mulhs_4xi32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2362,8 +2362,8 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocaptu
 ; EG-NEXT:     ADD_INT T0.X, PV.W, PS,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Z, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
+  %1 = load <4 x i32>, ptr addrspace(1) %in, align 16
   %2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
-  store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
+  store <4 x i32> %2, ptr addrspace(1) %out, align 16
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 4529dc5f1d213..78b80ca1c5ee8 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
 
-define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -230,7 +230,7 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = sdiv i64 %x, %y
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -463,7 +463,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv24_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -526,7 +526,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = ashr i64 %x, 40
   %2 = ashr i64 %y, 40
   %result = sdiv i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -572,7 +572,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv32_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s8, s[0:1], 0xe
@@ -629,11 +629,11 @@ define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = ashr i64 %x, 32
   %2 = ashr i64 %y, 32
   %result = sdiv i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv31_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -696,11 +696,11 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = ashr i64 %x, 33
   %2 = ashr i64 %y, 33
   %result = sdiv i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv23_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -763,11 +763,11 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = ashr i64 %x, 41
   %2 = ashr i64 %y, 41
   %result = sdiv i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv25_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -830,11 +830,11 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = ashr i64 %x, 39
   %2 = ashr i64 %y, 39
   %result = sdiv i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
 ; GCN-LABEL: s_test_sdiv24_v2i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
@@ -929,11 +929,11 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2
   %1 = ashr <2 x i64> %x, <i64 40, i64 40>
   %2 = ashr <2 x i64> %y, <i64 40, i64 40>
   %result = sdiv <2 x i64> %1, %2
-  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48 %y) {
+define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %y) {
 ; GCN-LABEL: s_test_sdiv24_48:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1073,11 +1073,11 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
   %result = sdiv i48 %1, %2
-  store i48 %result, i48 addrspace(1)* %out
+  store i48 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_sdiv_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1270,7 +1270,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = sdiv i64 24, %x
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -1764,7 +1764,7 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_sdiv24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1820,11 +1820,11 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_endpgm
   %x.shr = ashr i64 %x, 40
   %result = sdiv i64 24, %x.shr
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_sdiv24_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1878,7 +1878,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_endpgm
   %x.shr = ashr i64 %x, 40
   %result = sdiv i64 %x.shr, 23423
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll
index 785a6f9d1db33..08fb112dcb4f0 100644
--- a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll
@@ -12,12 +12,12 @@
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define amdgpu_kernel void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
-  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %num = load i8, i8 addrspace(1) * %in
-  %den = load i8, i8 addrspace(1) * %den_ptr
+define amdgpu_kernel void @sdiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %num = load i8, ptr addrspace(1) %in
+  %den = load i8, ptr addrspace(1) %den_ptr
   %result = sdiv i8 %num, %den
-  store i8 %result, i8 addrspace(1)* %out
+  store i8 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -31,12 +31,12 @@ define amdgpu_kernel void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define amdgpu_kernel void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
-  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
-  %num = load i16, i16 addrspace(1) * %in, align 2
-  %den = load i16, i16 addrspace(1) * %den_ptr, align 2
+define amdgpu_kernel void @sdiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
+  %num = load i16, ptr addrspace(1) %in, align 2
+  %den = load i16, ptr addrspace(1) %den_ptr, align 2
   %result = sdiv i16 %num, %den
-  store i16 %result, i16 addrspace(1)* %out, align 2
+  store i16 %result, ptr addrspace(1) %out, align 2
   ret void
 }
 
@@ -50,16 +50,16 @@ define amdgpu_kernel void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)*
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define amdgpu_kernel void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @sdiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = ashr i32 %num.i24.0, 8
   %den.i24 = ashr i32 %den.i24.0, 8
   %result = sdiv i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -69,16 +69,16 @@ define amdgpu_kernel void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @sdiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = ashr i32 %num.i24.0, 7
   %den.i24 = ashr i32 %den.i24.0, 7
   %result = sdiv i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -88,16 +88,16 @@ define amdgpu_kernel void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @test_no_sdiv24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = ashr i32 %num.i24.0, 8
   %den.i24 = ashr i32 %den.i24.0, 7
   %result = sdiv i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -107,16 +107,16 @@ define amdgpu_kernel void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addr
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @test_no_sdiv24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = ashr i32 %num.i24.0, 7
   %den.i24 = ashr i32 %den.i24.0, 8
   %result = sdiv i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -130,12 +130,12 @@ define amdgpu_kernel void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addr
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define amdgpu_kernel void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
-  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %num = load i8, i8 addrspace(1) * %in
-  %den = load i8, i8 addrspace(1) * %den_ptr
+define amdgpu_kernel void @srem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %num = load i8, ptr addrspace(1) %in
+  %den = load i8, ptr addrspace(1) %den_ptr
   %result = srem i8 %num, %den
-  store i8 %result, i8 addrspace(1)* %out
+  store i8 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -149,12 +149,12 @@ define amdgpu_kernel void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define amdgpu_kernel void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
-  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
-  %num = load i16, i16 addrspace(1) * %in, align 2
-  %den = load i16, i16 addrspace(1) * %den_ptr, align 2
+define amdgpu_kernel void @srem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
+  %num = load i16, ptr addrspace(1) %in, align 2
+  %den = load i16, ptr addrspace(1) %den_ptr, align 2
   %result = srem i16 %num, %den
-  store i16 %result, i16 addrspace(1)* %out, align 2
+  store i16 %result, ptr addrspace(1) %out, align 2
   ret void
 }
 
@@ -168,16 +168,16 @@ define amdgpu_kernel void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)*
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define amdgpu_kernel void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @srem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = ashr i32 %num.i24.0, 8
   %den.i24 = ashr i32 %den.i24.0, 8
   %result = srem i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -187,16 +187,16 @@ define amdgpu_kernel void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @no_srem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = ashr i32 %num.i24.0, 7
   %den.i24 = ashr i32 %den.i24.0, 7
   %result = srem i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -206,16 +206,16 @@ define amdgpu_kernel void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @no_sdiv25_i24_i25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i25.0 = shl i32 %den, 7
   %num.i24 = ashr i32 %num.i24.0, 8
   %den.i25 = ashr i32 %den.i25.0, 7
   %result = sdiv i32 %num.i24, %den.i25
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -225,16 +225,16 @@ define amdgpu_kernel void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 add
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @no_sdiv25_i25_i24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i25.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 8
   %num.i25 = ashr i32 %num.i25.0, 7
   %den.i24 = ashr i32 %den.i24.0, 8
   %result = sdiv i32 %num.i25, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -244,16 +244,16 @@ define amdgpu_kernel void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 add
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @no_srem25_i24_i25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i25.0 = shl i32 %den, 7
   %num.i24 = ashr i32 %num.i24.0, 8
   %den.i25 = ashr i32 %den.i25.0, 7
   %result = srem i32 %num.i24, %den.i25
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -263,16 +263,16 @@ define amdgpu_kernel void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 add
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @no_srem25_i25_i24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i25.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 8
   %num.i25 = ashr i32 %num.i25.0, 7
   %den.i24 = ashr i32 %den.i24.0, 8
   %result = srem i32 %num.i25, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -283,16 +283,16 @@ define amdgpu_kernel void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 add
 
 ; EG: INT_TO_FLT
 ; EG: RECIP_IEEE
-define amdgpu_kernel void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @srem25_i24_i11_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i11.0 = shl i32 %den, 21
   %num.i24 = ashr i32 %num.i24.0, 8
   %den.i11 = ashr i32 %den.i11.0, 21
   %result = srem i32 %num.i24, %den.i11
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -303,16 +303,16 @@ define amdgpu_kernel void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrsp
 
 ; EG: INT_TO_FLT
 ; EG: RECIP_IEEE
-define amdgpu_kernel void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @srem25_i11_i24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i11.0 = shl i32 %num, 21
   %den.i24.0 = shl i32 %den, 8
   %num.i11 = ashr i32 %num.i11.0, 21
   %den.i24 = ashr i32 %den.i24.0, 8
   %result = srem i32 %num.i11, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -323,15 +323,15 @@ define amdgpu_kernel void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrsp
 
 ; EG: INT_TO_FLT
 ; EG: RECIP_IEEE
-define amdgpu_kernel void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @srem25_i17_i12_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i17.0 = shl i32 %num, 15
   %den.i12.0 = shl i32 %den, 20
   %num.i17 = ashr i32 %num.i17.0, 15
   %den.i12 = ashr i32 %den.i12.0, 20
   %result = sdiv i32 %num.i17, %den.i12
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll b/llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll
index 95e18e56f816a..81791ea8d1c55 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll
@@ -6,17 +6,17 @@
 ; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
 ; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
-define void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
+define void @test_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
-  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i32 %tmp
+  %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
   %tmp5 = and i32 %tmp4, 255
   %tmp6 = zext i32 %tmp5 to i64
-  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
-  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp7 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
+  %tmp8 = load i64, ptr addrspace(1) %tmp7, align 8
   %tmp9 = add nsw i64 %tmp8, %tmp6
-  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  store i64 %tmp9, ptr addrspace(1) %tmp7, align 8
   ret void
 }
 
@@ -26,17 +26,17 @@ bb:
 ; GFX9: v_subbrev_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
 ; FIJI: v_sub_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; FIJI: v_subbrev_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
-define void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
+define void @test_sub_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
-  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i32 %tmp
+  %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
   %tmp5 = and i32 %tmp4, 255
   %tmp6 = zext i32 %tmp5 to i64
-  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
-  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp7 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
+  %tmp8 = load i64, ptr addrspace(1) %tmp7, align 8
   %tmp9 = sub nsw i64 %tmp8, %tmp6
-  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  store i64 %tmp9, ptr addrspace(1) %tmp7, align 8
   ret void
 }
 
@@ -49,25 +49,25 @@ bb:
 ; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
 ; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
-define amdgpu_kernel void @test1_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1, i64 addrspace(1)* %arg2) #0 {
+define amdgpu_kernel void @test1_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, ptr addrspace(1) %arg2) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
-  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i32 %tmp
+  %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
   %tmp5 = and i32 %tmp4, 255
   %tmp6 = zext i32 %tmp5 to i64
-  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
-  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp7 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
+  %tmp8 = load i64, ptr addrspace(1) %tmp7, align 8
   %tmp9 = add nsw i64 %tmp8, %tmp6
-  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
-  %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
-  %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
+  store i64 %tmp9, ptr addrspace(1) %tmp7, align 8
+  %tmp13 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i32 %tmp
+  %tmp14 = load i32, ptr addrspace(1) %tmp13, align 4
   %tmp15 = and i32 %tmp14, 255
   %tmp16 = zext i32 %tmp15 to i64
-  %tmp17 = getelementptr inbounds i64, i64 addrspace(1)* %arg2, i32 %tmp
-  %tmp18 = load i64, i64 addrspace(1)* %tmp17, align 8
+  %tmp17 = getelementptr inbounds i64, ptr addrspace(1) %arg2, i32 %tmp
+  %tmp18 = load i64, ptr addrspace(1) %tmp17, align 8
   %tmp19 = add nsw i64 %tmp18, %tmp16
-  store i64 %tmp19, i64 addrspace(1)* %tmp17, align 8
+  store i64 %tmp19, ptr addrspace(1) %tmp17, align 8
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index dc53635efca71..256fa6b2a6a75 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -12,11 +12,11 @@
 ; GFX9: v_add_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10: v_add_nc_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 
-define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %a = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i32, ptr addrspace(1) %in, align 4
   %shr = lshr i32 %a, 16
   %add = add i32 %a, %shr
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -28,11 +28,11 @@ define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; VI: v_subrev_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9: v_sub_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10: v_sub_nc_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %a = load i32, i32 addrspace(1)* %in, align 4
+define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i32, ptr addrspace(1) %in, align 4
   %shr = lshr i32 %a, 16
   %sub = sub i32 %shr, %a
-  store i32 %sub, i32 addrspace(1)* %out, align 4
+  store i32 %sub, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -44,16 +44,16 @@ define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 
 ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 
-define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) #0 {
+define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep1 = getelementptr i32, i32 addrspace(1)* %in1, i32 %idx
-  %gep2 = getelementptr i32, i32 addrspace(1)* %in2, i32 %idx
-  %a = load i32, i32 addrspace(1)* %gep1, align 4
-  %b = load i32, i32 addrspace(1)* %gep2, align 4
+  %gep1 = getelementptr i32, ptr addrspace(1) %in1, i32 %idx
+  %gep2 = getelementptr i32, ptr addrspace(1) %in2, i32 %idx
+  %a = load i32, ptr addrspace(1) %gep1, align 4
+  %b = load i32, ptr addrspace(1) %gep2, align 4
   %shra = lshr i32 %a, 16
   %shrb = lshr i32 %b, 16
   %mul = mul i32 %shra, %shrb
-  store i32 %mul, i32 addrspace(1)* %out, align 4
+  store i32 %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -64,15 +64,15 @@ define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; GFX10: v_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; SDWA-NOT: v_mul_u32_u24_sdwa
 
-define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gepa = getelementptr i16, i16 addrspace(1)* %ina, i32 %idx
-  %gepb = getelementptr i16, i16 addrspace(1)* %inb, i32 %idx
-  %a = load i16, i16 addrspace(1)* %gepa, align 4
-  %b = load i16, i16 addrspace(1)* %gepb, align 4
+  %gepa = getelementptr i16, ptr addrspace(1) %ina, i32 %idx
+  %gepb = getelementptr i16, ptr addrspace(1) %inb, i32 %idx
+  %a = load i16, ptr addrspace(1) %gepa, align 4
+  %b = load i16, ptr addrspace(1) %gepb, align 4
   %mul = mul i16 %a, %b
-  store i16 %mul, i16 addrspace(1)* %out, align 4
+  store i16 %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -90,15 +90,15 @@ entry:
 
 ; GFX9_10: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
-define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gepa = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %ina, i32 %idx
-  %gepb = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %inb, i32 %idx
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gepa, align 4
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gepb, align 4
+  %gepa = getelementptr <2 x i16>, ptr addrspace(1) %ina, i32 %idx
+  %gepb = getelementptr <2 x i16>, ptr addrspace(1) %inb, i32 %idx
+  %a = load <2 x i16>, ptr addrspace(1) %gepa, align 4
+  %b = load <2 x i16>, ptr addrspace(1) %gepb, align 4
   %mul = mul <2 x i16> %a, %b
-  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -120,15 +120,15 @@ entry:
 ; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
-define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gepa = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %ina, i32 %idx
-  %gepb = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %inb, i32 %idx
-  %a = load <4 x i16>, <4 x i16> addrspace(1)* %gepa, align 4
-  %b = load <4 x i16>, <4 x i16> addrspace(1)* %gepb, align 4
+  %gepa = getelementptr <4 x i16>, ptr addrspace(1) %ina, i32 %idx
+  %gepb = getelementptr <4 x i16>, ptr addrspace(1) %inb, i32 %idx
+  %a = load <4 x i16>, ptr addrspace(1) %gepa, align 4
+  %b = load <4 x i16>, ptr addrspace(1) %gepb, align 4
   %mul = mul <4 x i16> %a, %b
-  store <4 x i16> %mul, <4 x i16> addrspace(1)* %out, align 4
+  store <4 x i16> %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -158,15 +158,15 @@ entry:
 ; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
-define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gepa = getelementptr <8 x i16>, <8 x i16> addrspace(1)* %ina, i32 %idx
-  %gepb = getelementptr <8 x i16>, <8 x i16> addrspace(1)* %inb, i32 %idx
-  %a = load <8 x i16>, <8 x i16> addrspace(1)* %gepa, align 4
-  %b = load <8 x i16>, <8 x i16> addrspace(1)* %gepb, align 4
+  %gepa = getelementptr <8 x i16>, ptr addrspace(1) %ina, i32 %idx
+  %gepb = getelementptr <8 x i16>, ptr addrspace(1) %inb, i32 %idx
+  %a = load <8 x i16>, ptr addrspace(1) %gepa, align 4
+  %b = load <8 x i16>, ptr addrspace(1) %gepb, align 4
   %mul = mul <8 x i16> %a, %b
-  store <8 x i16> %mul, <8 x i16> addrspace(1)* %out, align 4
+  store <8 x i16> %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -176,12 +176,12 @@ entry:
 ; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; SDWA-NOT: v_mul_f16_sdwa
 
-define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
-  %a = load half, half addrspace(1)* %ina, align 4
-  %b = load half, half addrspace(1)* %inb, align 4
+  %a = load half, ptr addrspace(1) %ina, align 4
+  %b = load half, ptr addrspace(1) %inb, align 4
   %mul = fmul half %a, %b
-  store half %mul, half addrspace(1)* %out, align 4
+  store half %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -199,12 +199,12 @@ entry:
 
 ; GFX9_10: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
-define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
-  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
-  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
+  %a = load <2 x half>, ptr addrspace(1) %ina, align 4
+  %b = load <2 x half>, ptr addrspace(1) %inb, align 4
   %mul = fmul <2 x half> %a, %b
-  store <2 x half> %mul, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -224,12 +224,12 @@ entry:
 ; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
-define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
-  %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4
-  %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4
+  %a = load <4 x half>, ptr addrspace(1) %ina, align 4
+  %b = load <4 x half>, ptr addrspace(1) %inb, align 4
   %mul = fmul <4 x half> %a, %b
-  store <4 x half> %mul, <4 x half> addrspace(1)* %out, align 4
+  store <4 x half> %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -255,12 +255,12 @@ entry:
 ; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
-define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
-  %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4
-  %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4
+  %a = load <8 x half>, ptr addrspace(1) %ina, align 4
+  %b = load <8 x half>, ptr addrspace(1) %inb, align 4
   %mul = fmul <8 x half> %a, %b
-  store <8 x half> %mul, <8 x half> addrspace(1)* %out, align 4
+  store <8 x half> %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -271,15 +271,15 @@ entry:
 ; GFX10: v_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; SDWA-NOT: v_mul_u32_u24_sdwa
 
-define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gepa = getelementptr i8, i8 addrspace(1)* %ina, i32 %idx
-  %gepb = getelementptr i8, i8 addrspace(1)* %inb, i32 %idx
-  %a = load i8, i8 addrspace(1)* %gepa, align 4
-  %b = load i8, i8 addrspace(1)* %gepb, align 4
+  %gepa = getelementptr i8, ptr addrspace(1) %ina, i32 %idx
+  %gepb = getelementptr i8, ptr addrspace(1) %inb, i32 %idx
+  %a = load i8, ptr addrspace(1) %gepa, align 4
+  %b = load i8, ptr addrspace(1) %gepb, align 4
   %mul = mul i8 %a, %b
-  store i8 %mul, i8 addrspace(1)* %out, align 4
+  store i8 %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -303,15 +303,15 @@ entry:
 
 ; GFX10: v_lshlrev_b16 v{{[0-9]+}}, 8, v
 ; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gepa = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %ina, i32 %idx
-  %gepb = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %inb, i32 %idx
-  %a = load <2 x i8>, <2 x i8> addrspace(1)* %gepa, align 4
-  %b = load <2 x i8>, <2 x i8> addrspace(1)* %gepb, align 4
+  %gepa = getelementptr <2 x i8>, ptr addrspace(1) %ina, i32 %idx
+  %gepb = getelementptr <2 x i8>, ptr addrspace(1) %inb, i32 %idx
+  %a = load <2 x i8>, ptr addrspace(1) %gepa, align 4
+  %b = load <2 x i8>, ptr addrspace(1) %gepb, align 4
   %mul = mul <2 x i8> %a, %b
-  store <2 x i8> %mul, <2 x i8> addrspace(1)* %out, align 4
+  store <2 x i8> %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -336,15 +336,15 @@ entry:
 ; GFX10-DAG: v_mul_lo_u16
 ; GFX10-DAG: v_mul_lo_u16
 
-define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gepa = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %ina, i32 %idx
-  %gepb = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %inb, i32 %idx
-  %a = load <4 x i8>, <4 x i8> addrspace(1)* %gepa, align 4
-  %b = load <4 x i8>, <4 x i8> addrspace(1)* %gepb, align 4
+  %gepa = getelementptr <4 x i8>, ptr addrspace(1) %ina, i32 %idx
+  %gepb = getelementptr <4 x i8>, ptr addrspace(1) %inb, i32 %idx
+  %a = load <4 x i8>, ptr addrspace(1) %gepa, align 4
+  %b = load <4 x i8>, ptr addrspace(1) %gepb, align 4
   %mul = mul <4 x i8> %a, %b
-  store <4 x i8> %mul, <4 x i8> addrspace(1)* %out, align 4
+  store <4 x i8> %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -379,15 +379,15 @@ entry:
 ; GFX10-DAG: v_mul_lo_u16
 ; GFX10-DAG: v_mul_lo_u16
 
-define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gepa = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %ina, i32 %idx
-  %gepb = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %inb, i32 %idx
-  %a = load <8 x i8>, <8 x i8> addrspace(1)* %gepa, align 4
-  %b = load <8 x i8>, <8 x i8> addrspace(1)* %gepb, align 4
+  %gepa = getelementptr <8 x i8>, ptr addrspace(1) %ina, i32 %idx
+  %gepb = getelementptr <8 x i8>, ptr addrspace(1) %inb, i32 %idx
+  %a = load <8 x i8>, ptr addrspace(1) %gepa, align 4
+  %b = load <8 x i8>, ptr addrspace(1) %gepb, align 4
   %mul = mul <8 x i8> %a, %b
-  store <8 x i8> %mul, <8 x i8> addrspace(1)* %out, align 4
+  store <8 x i8> %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -402,12 +402,12 @@ entry:
 
 ; FIXME: Should be able to avoid or
 define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
-    <2 x half> addrspace(1)* %r,
-    <2 x i16> addrspace(1)* %a) #0 {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) #0 {
 entry:
-  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
+  %a.val = load <2 x i16>, ptr addrspace(1) %a
   %r.val = sitofp <2 x i16> %a.val to <2 x half>
-  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  store <2 x half> %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -426,13 +426,13 @@ entry:
 ; GFX9_10: v_pk_mul_f16 v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v[[SRC:[0-9]+]]
 ; GFX9_10: v_pk_add_f16 v{{[0-9]+}}, v[[DST_MUL]], v[[SRC]]
 
-define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
-  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
-  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
+  %a = load <2 x half>, ptr addrspace(1) %ina, align 4
+  %b = load <2 x half>, ptr addrspace(1) %inb, align 4
   %mul = fmul <2 x half> %a, %b
   %mac = fadd <2 x half> %mul, %b
-  store <2 x half> %mac, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %mac, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -447,13 +447,13 @@ entry:
 
 ; GFX10: v_pk_mul_lo_u16 v{{[0-9]+}}, 0x141007b, v{{[0-9]+}}
 
-define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %idx
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep, align 4
+  %gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %idx
+  %a = load <2 x i16>, ptr addrspace(1) %gep, align 4
   %mul = mul <2 x i16> %a, <i16 123, i16 321>
-  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -471,16 +471,16 @@ entry:
 ; GFX9_10: v_pk_mul_lo_u16 v[[DST1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9_10: v_pk_mul_lo_u16 v{{[0-9]+}}, v[[DST1]], v{{[0-9]+}}
 
-define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
-  %gepa = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %ina, i32 %idx
-  %gepb = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %inb, i32 %idx
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gepa, align 4
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gepb, align 4
+  %gepa = getelementptr <2 x i16>, ptr addrspace(1) %ina, i32 %idx
+  %gepb = getelementptr <2 x i16>, ptr addrspace(1) %inb, i32 %idx
+  %a = load <2 x i16>, ptr addrspace(1) %gepa, align 4
+  %b = load <2 x i16>, ptr addrspace(1) %gepb, align 4
   %mul = mul <2 x i16> %a, %b
   %mul2 = mul <2 x i16> %mul, %b
-  store <2 x i16> %mul2, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %mul2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -493,16 +493,16 @@ entry:
 
 ; GFX9_10: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
-define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 {
+define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
 entry:
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %a = load <2 x i16>, ptr addrspace(1) %ina, align 4
+  %b = load <2 x i16>, ptr addrspace(1) %inb, align 4
   br label %add_label
 add_label:
   %add = add <2 x i16> %a, %b
   br label %store_label
 store_label:
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -538,11 +538,11 @@ store_label:
 ; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 
-define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) #0 {
+define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrspace(1) %destValues) #0 {
 entry:
   %idxprom = ashr exact i64 15, 32
-  %arrayidx = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %sourceA, i64 %idxprom
-  %tmp = load <8 x i8>, <8 x i8> addrspace(1)* %arrayidx, align 8
+  %arrayidx = getelementptr inbounds <8 x i8>, ptr addrspace(1) %sourceA, i64 %idxprom
+  %tmp = load <8 x i8>, ptr addrspace(1) %arrayidx, align 8
 
   %tmp1 = extractelement <8 x i8> %tmp, i32 0
   %tmp2 = extractelement <8 x i8> %tmp, i32 1
@@ -566,8 +566,8 @@ entry:
   %tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 
-  %arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
-  store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
+  %arrayidx5 = getelementptr inbounds <8 x i8>, ptr addrspace(1) %destValues, i64 %idxprom
+  store <8 x i8> %tmp19, ptr addrspace(1) %arrayidx5, align 8
   ret void
 }
 
@@ -595,7 +595,7 @@ bb2:                                              ; preds = %bb1
 
 bb11:                                             ; preds = %bb10, %bb2
   %tmp12 = phi <2 x i32> [ %tmp6, %bb2 ], [ %tmp, %bb1 ]
-  store volatile <2 x i32> %tmp12, <2 x i32> addrspace(1)* undef
+  store volatile <2 x i32> %tmp12, ptr addrspace(1) undef
   br label %bb1
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
index 772f07f3aecd0..af27473a0a61b 100644
--- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
@@ -3,7 +3,7 @@
 
 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
 declare i32 @llvm.amdgcn.sffbh.i32(i32) nounwind readnone speculatable
-define amdgpu_kernel void @select_constant_cttz(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
+define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GCN-LABEL: select_constant_cttz:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -28,7 +28,7 @@ define amdgpu_kernel void @select_constant_cttz(i32 addrspace(1)* noalias %out,
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
-  %v    = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %v    = load i32, ptr addrspace(1) %arrayidx, align 4
   %sr   = lshr i32 1, %v
   %cmp  = icmp ne i32 %v, 0
   %cttz = call i32 @llvm.cttz.i32(i32 %sr, i1 true), !range !0
@@ -38,7 +38,7 @@ define amdgpu_kernel void @select_constant_cttz(i32 addrspace(1)* noalias %out,
   %cmp2 = icmp eq i32 %sel, 0
   %or   = or i1 %cmp, %cmp2
   %sel2 = select i1 %or, i32 -1, i32 %sub
-  store i32 %sel2, i32 addrspace(1)* %out
+  store i32 %sel2, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
index 48f514b4b8ed1..c12198f113c71 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
@@ -12,13 +12,13 @@
 ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
 ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
 define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %rcp = call float @llvm.amdgcn.rcp.legacy(float %x)
   %fneg = fsub float -0.0, %rcp
   %select = select i1 %cmp, float %fneg, float 2.0
-  store volatile float %select, float addrspace(1)* undef
+  store volatile float %select, ptr addrspace(1) undef
   ret void
 }
 
@@ -30,12 +30,12 @@ define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
 ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
 ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
 define amdgpu_kernel void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 {
-  %x = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %mul = call float @llvm.amdgcn.fmul.legacy(float %x, float 4.0)
   %fneg = fsub float -0.0, %mul
   %select = select i1 %cmp, float %fneg, float 2.0
-  store volatile float %select, float addrspace(1)* undef
+  store volatile float %select, ptr addrspace(1) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
index 5ff9698396444..3340f43b32e7c 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -9,15 +9,15 @@
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
 define amdgpu_kernel void @add_select_fabs_fabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %select = select i1 %cmp, float %fabs.x, float %fabs.y
   %add = fadd float %select, %z
-  store float %add, float addrspace(1)* undef
+  store float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -31,18 +31,18 @@ define amdgpu_kernel void @add_select_fabs_fabs_f32(i32 %c) #0 {
 ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
 ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[W]]
 define amdgpu_kernel void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
-  %w = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
+  %w = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %select = select i1 %cmp, float %fabs.x, float %fabs.y
   %add0 = fadd float %select, %z
   %add1 = fadd float %fabs.x, %w
-  store volatile float %add0, float addrspace(1)* undef
-  store volatile float %add1, float addrspace(1)* undef
+  store volatile float %add0, ptr addrspace(1) undef
+  store volatile float %add1, ptr addrspace(1) undef
   ret void
 }
 
@@ -58,16 +58,16 @@ define amdgpu_kernel void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 {
 ; GCN: buffer_store_dword [[ADD]]
 ; GCN: buffer_store_dword [[X_ABS]]
 define amdgpu_kernel void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %select = select i1 %cmp, float %fabs.x, float %fabs.y
   %add0 = fadd float %select, %z
-  store volatile float %add0, float addrspace(1)* undef
-  store volatile float %fabs.x, float addrspace(1)* undef
+  store volatile float %add0, ptr addrspace(1) undef
+  store volatile float %fabs.x, ptr addrspace(1) undef
   ret void
 }
 
@@ -81,18 +81,18 @@ define amdgpu_kernel void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c)
 ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
 ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[Y]]|, [[W]]
 define amdgpu_kernel void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
-  %w = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
+  %w = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %select = select i1 %cmp, float %fabs.x, float %fabs.y
   %add0 = fadd float %select, %z
   %add1 = fadd float %fabs.y, %w
-  store volatile float %add0, float addrspace(1)* undef
-  store volatile float %add1, float addrspace(1)* undef
+  store volatile float %add0, ptr addrspace(1) undef
+  store volatile float %add1, ptr addrspace(1) undef
   ret void
 }
 
@@ -104,14 +104,14 @@ define amdgpu_kernel void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[Y]], |[[X]]|,
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
 define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %select = select i1 %cmp, float %fabs.x, float %y
   %add = fadd float %select, %z
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -122,13 +122,13 @@ define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, |[[X]]|,
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
 define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs = call float @llvm.fabs.f32(float %x)
   %select = select i1 %cmp, float %fabs, float -1.0
   %add = fadd float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -139,12 +139,12 @@ define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]]
 define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float -2.0, float -1.0
   %fabs = call float @llvm.fabs.f32(float %select)
   %add = fadd float %fabs, %x
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -154,11 +154,11 @@ define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
 define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float 2.0, float 1.0
   %add = fadd float %select, %x
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -171,13 +171,13 @@ define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, |[[X]]|, [[VCC]]
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
 define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs = call float @llvm.fabs.f32(float %x)
   %select = select i1 %cmp, float -1.0, float %fabs
   %add = fadd float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -191,13 +191,13 @@ define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]]
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
 define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs = call float @llvm.fabs.f32(float %x)
   %select = select i1 %cmp, float -1024.0, float %fabs
   %add = fadd float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -208,14 +208,14 @@ define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
 define amdgpu_kernel void @add_select_fabs_posk_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
 
   %cmp = icmp eq i32 %c, 0
   %fabs = call float @llvm.fabs.f32(float %x)
   %select = select i1 %cmp, float %fabs, float 1.0
   %add = fadd float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -228,13 +228,13 @@ define amdgpu_kernel void @add_select_fabs_posk_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
 define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs = call float @llvm.fabs.f32(float %x)
   %select = select i1 %cmp, float 1.0, float %fabs
   %add = fadd float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -246,15 +246,15 @@ define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
 define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.0, %x
   %fneg.y = fsub float -0.0, %y
   %select = select i1 %cmp, float %fneg.x, float %fneg.y
   %add = fadd float %select, %z
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -268,18 +268,18 @@ define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 {
 ; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
 ; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[W]], [[X]]
 define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
-  %w = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
+  %w = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.0, %x
   %fneg.y = fsub float -0.0, %y
   %select = select i1 %cmp, float %fneg.x, float %fneg.y
   %add0 = fadd float %select, %z
   %add1 = fadd float %fneg.x, %w
-  store volatile float %add0, float addrspace(1)* undef
-  store volatile float %add1, float addrspace(1)* undef
+  store volatile float %add0, ptr addrspace(1) undef
+  store volatile float %add1, ptr addrspace(1) undef
   ret void
 }
 
@@ -295,16 +295,16 @@ define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
 ; GCN: buffer_store_dword [[ADD]]
 ; GCN: buffer_store_dword [[NEG_X]]
 define amdgpu_kernel void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.0, %x
   %fneg.y = fsub float -0.0, %y
   %select = select i1 %cmp, float %fneg.x, float %fneg.y
   %add0 = fadd float %select, %z
-  store volatile float %add0, float addrspace(1)* undef
-  store volatile float %fneg.x, float addrspace(1)* undef
+  store volatile float %add0, ptr addrspace(1) undef
+  store volatile float %fneg.x, ptr addrspace(1) undef
   ret void
 }
 
@@ -318,18 +318,18 @@ define amdgpu_kernel void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c)
 ; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
 ; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[W]], [[Y]]
 define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
-  %w = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
+  %w = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.0, %x
   %fneg.y = fsub float -0.0, %y
   %select = select i1 %cmp, float %fneg.x, float %fneg.y
   %add0 = fadd float %select, %z
   %add1 = fadd float %fneg.y, %w
-  store volatile float %add0, float addrspace(1)* undef
-  store volatile float %add1, float addrspace(1)* undef
+  store volatile float %add0, ptr addrspace(1) undef
+  store volatile float %add1, ptr addrspace(1) undef
   ret void
 }
 
@@ -341,14 +341,14 @@ define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[Y]], -[[X]],
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
 define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.0, %x
   %select = select i1 %cmp, float %fneg.x, float %y
   %add = fadd float %select, %z
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -359,13 +359,13 @@ define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
 define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.0, %x
   %select = select i1 %cmp, float %fneg.x, float -1.0
   %add = fadd float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -377,13 +377,13 @@ define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
 define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.0, %x
   %select = select i1 %cmp, float %fneg.x, float 0x3FC45F3060000000
   %add = fadd float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -397,13 +397,13 @@ define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
 
 ; GCN: v_sub_f32_e32 v{{[0-9]+}},  [[Y]], [[SELECT]]
 define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.0, %x
   %select = select i1 %cmp, float %fneg.x, float 0xBFC45F3060000000
   %add = fadd float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -414,11 +414,11 @@ define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
 define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float -2.0, float -1.0
   %add = fadd float %select, %x
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -431,11 +431,11 @@ define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
 define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float -2048.0, float -4096.0
   %add = fadd float %select, %x
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -445,12 +445,12 @@ define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
 define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float -2.0, float -1.0
   %fneg.x = fsub float -0.0, %select
   %add = fadd float %fneg.x, %x
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -463,13 +463,13 @@ define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
 define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.0, %x
   %select = select i1 %cmp, float -1.0, float %fneg.x
   %add = fadd float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -480,13 +480,13 @@ define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
 define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.0, %x
   %select = select i1 %cmp, float %fneg.x, float 1.0
   %add = fadd float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -499,13 +499,13 @@ define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
 define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.0, %x
   %select = select i1 %cmp, float 1.0, float %fneg.x
   %add = fadd float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -517,16 +517,16 @@ define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], |[[Y]]|, -|[[X]]|,
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
 define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %select = select i1 %cmp, float %fneg.fabs.x, float %fabs.y
   %add = fadd float %select, %z
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -538,16 +538,16 @@ define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -|[[Y]]|, |[[X]]|,
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
 define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %fneg.fabs.y = fsub float -0.000000e+00, %fabs.y
   %select = select i1 %cmp, float %fabs.x, float %fneg.fabs.y
   %add = fadd float %select, %z
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -559,15 +559,15 @@ define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], |[[Y]]|, -[[X]],
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
 define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.000000e+00, %x
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %select = select i1 %cmp, float %fneg.x, float %fabs.y
   %add = fadd float %select, %z
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -579,15 +579,15 @@ define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -[[Y]], |[[X]]|,
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
 define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fneg.y = fsub float -0.000000e+00, %y
   %select = select i1 %cmp, float %fabs.x, float %fneg.y
   %add = fadd float %select, %z
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -599,16 +599,16 @@ define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], |[[Y]]|, [[X]],
 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
 define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub float -0.000000e+00, %x
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %fneg.fabs.y = fsub float -0.000000e+00, %fabs.y
   %select = select i1 %cmp, float %fneg.x, float %fneg.fabs.y
   %add = fadd float %select, %z
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -620,16 +620,16 @@ define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], |[[X]]|, [[Y]],
 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
 define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
   %fneg.y = fsub float -0.000000e+00, %y
   %select = select i1 %cmp, float %fneg.y, float %fneg.fabs.x
   %add = fadd float %select, %z
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -642,14 +642,14 @@ define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]]
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
 define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
   %select = select i1 %cmp, float %fneg.fabs.x, float 4.0
   %add = fmul float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -662,14 +662,14 @@ define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]]
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
 define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
   %select = select i1 %cmp, float 4.0, float %fneg.fabs.x
   %add = fmul float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -680,14 +680,14 @@ define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
 define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
   %select = select i1 %cmp, float %fneg.fabs.x, float -4.0
   %add = fmul float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -700,14 +700,14 @@ define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
 define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fneg.fabs.x = fsub float -0.000000e+00, %fabs.x
   %select = select i1 %cmp, float -4.0, float %fneg.fabs.x
   %add = fmul float %select, %y
-  store volatile float %add, float addrspace(1)* undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -723,13 +723,13 @@ define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
 define amdgpu_kernel void @select_fneg_posk_src_add_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %add = fadd float %x, 4.0
   %fneg = fsub float -0.0, %add
   %select = select i1 %cmp, float %fneg, float 2.0
-  store volatile float %select, float addrspace(1)* undef
+  store volatile float %select, ptr addrspace(1) undef
   ret void
 }
 
@@ -740,12 +740,12 @@ define amdgpu_kernel void @select_fneg_posk_src_add_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
 define amdgpu_kernel void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %add = fsub float %x, 4.0
   %fneg = fsub float -0.0, %add
   %select = select i1 %cmp, float %fneg, float 2.0
-  store volatile float %select, float addrspace(1)* undef
+  store volatile float %select, ptr addrspace(1) undef
   ret void
 }
 
@@ -756,12 +756,12 @@ define amdgpu_kernel void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
 define amdgpu_kernel void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %mul = fmul float %x, 4.0
   %fneg = fsub float -0.0, %mul
   %select = select i1 %cmp, float %fneg, float 2.0
-  store volatile float %select, float addrspace(1)* undef
+  store volatile float %select, ptr addrspace(1) undef
   ret void
 }
 
@@ -773,13 +773,13 @@ define amdgpu_kernel void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[FMA]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
 define amdgpu_kernel void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fma = call float @llvm.fma.f32(float %x, float 4.0, float %z)
   %fneg = fsub float -0.0, %fma
   %select = select i1 %cmp, float %fneg, float 2.0
-  store volatile float %select, float addrspace(1)* undef
+  store volatile float %select, ptr addrspace(1) undef
   ret void
 }
 
@@ -790,13 +790,13 @@ define amdgpu_kernel void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[X]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
 define amdgpu_kernel void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %z = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %z = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %fmad = call float @llvm.fmuladd.f32(float %x, float 4.0, float %z)
   %fneg = fsub float -0.0, %fmad
   %select = select i1 %cmp, float %fneg, float 2.0
-  store volatile float %select, float addrspace(1)* undef
+  store volatile float %select, ptr addrspace(1) undef
   ret void
 }
 
@@ -809,13 +809,13 @@ define amdgpu_kernel void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
 ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
 ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
 define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 {
-  %x = load volatile float, float addrspace(1)* undef
-  %y = load volatile float, float addrspace(1)* undef
+  %x = load volatile float, ptr addrspace(1) undef
+  %y = load volatile float, ptr addrspace(1) undef
   %cmp = icmp eq i32 %c, 0
   %rcp = call float @llvm.amdgcn.rcp.f32(float %x)
   %fneg = fsub float -0.0, %rcp
   %select = select i1 %cmp, float %fneg, float 2.0
-  store volatile float %select, float addrspace(1)* undef
+  store volatile float %select, ptr addrspace(1) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/select-i1.ll b/llvm/test/CodeGen/AMDGPU/select-i1.ll
index 1024f06608d71..eb7ceb82ff9e9 100644
--- a/llvm/test/CodeGen/AMDGPU/select-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-i1.ll
@@ -6,10 +6,10 @@
 ; GCN-LABEL: {{^}}select_i1:
 ; GCN: v_cndmask_b32
 ; GCN-NOT: v_cndmask_b32
-define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind {
+define amdgpu_kernel void @select_i1(ptr addrspace(1) %out, i32 %cond, i1 %a, i1 %b) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i1 %a, i1 %b
-  store i1 %sel, i1 addrspace(1)* %out, align 4
+  store i1 %sel, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -24,9 +24,9 @@ define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1
 ; GCN: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
 ; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]]
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]]
-define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, [8 x i32], i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
+define amdgpu_kernel void @s_minmax_i1(ptr addrspace(1) %out, [8 x i32], i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
   %cmp = icmp slt i1 %cond, false
   %sel = select i1 %cmp, i1 %a, i1 %b
-  store i1 %sel, i1 addrspace(1)* %out, align 4
+  store i1 %sel, ptr addrspace(1) %out, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll
index 454a45d6365da..c9e0bc7c5cafd 100644
--- a/llvm/test/CodeGen/AMDGPU/select-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll
@@ -14,12 +14,12 @@
 ; GCN: s_cselect_b32 [[RESULT:s[0-9]+]]
 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
 ; GCN: buffer_store_dword [[VRESULT]]
-define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_and_cmp_i32(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
   %and = and i1 %icmp0, %icmp1
   %select = select i1 %and, i32 %x, i32 %y
-  store i32 %select, i32 addrspace(1)* %out
+  store i32 %select, ptr addrspace(1) %out
   ret void
 }
 
@@ -31,12 +31,12 @@ define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i3
 ; GCN: s_cselect_b32 [[RESULT:s[0-9]+]]
 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
 ; GCN: buffer_store_dword [[VRESULT]]
-define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_and_cmp_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
   %and = and i1 %fcmp0, %fcmp1
   %select = select i1 %and, i32 %x, i32 %y
-  store i32 %select, i32 addrspace(1)* %out
+  store i32 %select, ptr addrspace(1) %out
   ret void
 }
 
@@ -52,12 +52,12 @@ define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, fl
 ; GCN-DAG: v_mov_b32_e32 v[[VRESULT1:[0-9]+]], [[RESULT0]]
 ; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]]
 ; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]]
-define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_and_cmp_i32(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
   %and = and i1 %icmp0, %icmp1
   %select = select i1 %and, i64 %x, i64 %y
-  store i64 %select, i64 addrspace(1)* %out
+  store i64 %select, ptr addrspace(1) %out
   ret void
 }
 
@@ -71,12 +71,12 @@ define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i3
 ; GCN-DAG: v_mov_b32_e32 v[[VRESULT1:[0-9]+]], [[RESULT0]]
 ; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]]
 ; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]]
-define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_and_cmp_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
   %and = and i1 %fcmp0, %fcmp1
   %select = select i1 %and, i64 %x, i64 %y
-  store i64 %select, i64 addrspace(1)* %out
+  store i64 %select, ptr addrspace(1) %out
   ret void
 }
 
@@ -91,12 +91,12 @@ define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, fl
 ; GCN-DAG: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
 ; GCN: buffer_store_dword [[VRESULT]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_or_cmp_i32(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
   %or = or i1 %icmp0, %icmp1
   %select = select i1 %or, i32 %x, i32 %y
-  store i32 %select, i32 addrspace(1)* %out
+  store i32 %select, ptr addrspace(1) %out
   ret void
 }
 
@@ -108,12 +108,12 @@ define amdgpu_kernel void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32
 ; GCN-DAG: s_cselect_b32 [[RESULT:s[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
 ; GCN: buffer_store_dword [[VRESULT]]
-define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_or_cmp_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
   %or = or i1 %fcmp0, %fcmp1
   %select = select i1 %or, i32 %x, i32 %y
-  store i32 %select, i32 addrspace(1)* %out
+  store i32 %select, ptr addrspace(1) %out
   ret void
 }
 
@@ -129,12 +129,12 @@ define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, flo
 ; GCN-DAG: v_mov_b32_e32 v[[VRESULT1:[0-9]+]], [[RESULT0]]
 ; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]]
 ; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]]
-define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_or_cmp_i32(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
   %or = or i1 %icmp0, %icmp1
   %select = select i1 %or, i64 %x, i64 %y
-  store i64 %select, i64 addrspace(1)* %out
+  store i64 %select, ptr addrspace(1) %out
   ret void
 }
 
@@ -148,19 +148,19 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32
 ; GCN-DAG: v_mov_b32_e32 v[[VRESULT1:[0-9]+]], [[RESULT0]]
 ; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]]
 ; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]]
-define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_or_cmp_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
   %or = or i1 %fcmp0, %fcmp1
   %select = select i1 %or, i64 %x, i64 %y
-  store i64 %select, i64 addrspace(1)* %out
+  store i64 %select, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}regression:
 ; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
 
-define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
+define amdgpu_kernel void @regression(ptr addrspace(1) %out, float %c0, float %c1) #0 {
 entry:
   %cmp0 = fcmp oeq float %c0, 1.0
   br i1 %cmp0, label %if0, label %endif
@@ -176,7 +176,7 @@ if1:
 endif:
   %tmp0 = phi i1 [ true, %entry ], [ %cmp2, %if1 ], [ false, %if0 ]
   %tmp2 = select i1 %tmp0, float 4.0, float 0.0
-  store float %tmp2, float addrspace(1)* %out
+  store float %tmp2, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll
index 81deec1e0dbb8..b35bcbff94d5b 100644
--- a/llvm/test/CodeGen/AMDGPU/select-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll
@@ -23,20 +23,20 @@ define float @select_undef_rhs(float %val, i1 %cond) {
 ; GCN-LABEL: {{^}}select_undef_n1:
 ; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
 ; GCN: store_dword {{[^,]+}}, [[RES]]
-define void @select_undef_n1(float addrspace(1)* %a, i32 %c) {
+define void @select_undef_n1(ptr addrspace(1) %a, i32 %c) {
   %cc = icmp eq i32 %c, 0
   %sel = select i1 %cc, float 1.000000e+00, float undef
-  store float %sel, float addrspace(1)* %a
+  store float %sel, ptr addrspace(1) %a
   ret void
 }
 
 ; GCN-LABEL: {{^}}select_undef_n2:
 ; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
 ; GCN: store_dword {{[^,]+}}, [[RES]]
-define void @select_undef_n2(float addrspace(1)* %a, i32 %c) {
+define void @select_undef_n2(ptr addrspace(1) %a, i32 %c) {
   %cc = icmp eq i32 %c, 0
   %sel = select i1 %cc, float undef, float 1.000000e+00
-  store float %sel, float addrspace(1)* %a
+  store float %sel, ptr addrspace(1) %a
   ret void
 }
 
@@ -48,18 +48,18 @@ declare float @llvm.amdgcn.rcp.f32(float)
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
 ; GCN: s_cbranch_vccnz
-define amdgpu_kernel void @undef_v6f32(<6 x float> addrspace(3)* %ptr, i1 %cond) {
+define amdgpu_kernel void @undef_v6f32(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
 
 loop:
   %phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ]
-  %load = load volatile <6 x float>, <6 x float> addrspace(3)* undef
+  %load = load volatile <6 x float>, ptr addrspace(3) undef
   %add = fadd <6 x float> %load, %phi
   br i1 %cond, label %loop, label %ret
 
 ret:
-  store volatile <6 x float> %add, <6 x float> addrspace(3)* undef
+  store volatile <6 x float> %add, ptr addrspace(3) undef
   ret void
 }
 
@@ -67,18 +67,18 @@ ret:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
 ; GCN: s_cbranch_vccnz
-define amdgpu_kernel void @undef_v6i32(<6 x i32> addrspace(3)* %ptr, i1 %cond) {
+define amdgpu_kernel void @undef_v6i32(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
 
 loop:
   %phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ]
-  %load = load volatile <6 x i32>, <6 x i32> addrspace(3)* undef
+  %load = load volatile <6 x i32>, ptr addrspace(3) undef
   %add = add <6 x i32> %load, %phi
   br i1 %cond, label %loop, label %ret
 
 ret:
-  store volatile <6 x i32> %add, <6 x i32> addrspace(3)* undef
+  store volatile <6 x i32> %add, ptr addrspace(3) undef
   ret void
 }
 
@@ -87,18 +87,18 @@ ret:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
 ; GCN: s_cbranch_vccnz
-define amdgpu_kernel void @undef_v5f32(<5 x float> addrspace(3)* %ptr, i1 %cond) {
+define amdgpu_kernel void @undef_v5f32(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
 
 loop:
   %phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ]
-  %load = load volatile <5 x float>, <5 x float> addrspace(3)* undef
+  %load = load volatile <5 x float>, ptr addrspace(3) undef
   %add = fadd <5 x float> %load, %phi
   br i1 %cond, label %loop, label %ret
 
 ret:
-  store volatile <5 x float> %add, <5 x float> addrspace(3)* undef
+  store volatile <5 x float> %add, ptr addrspace(3) undef
   ret void
 }
 
@@ -106,18 +106,18 @@ ret:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
 ; GCN: s_cbranch_vccnz
-define amdgpu_kernel void @undef_v5i32(<5 x i32> addrspace(3)* %ptr, i1 %cond) {
+define amdgpu_kernel void @undef_v5i32(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
 
 loop:
   %phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ]
-  %load = load volatile <5 x i32>, <5 x i32> addrspace(3)* undef
+  %load = load volatile <5 x i32>, ptr addrspace(3) undef
   %add = add <5 x i32> %load, %phi
   br i1 %cond, label %loop, label %ret
 
 ret:
-  store volatile <5 x i32> %add, <5 x i32> addrspace(3)* undef
+  store volatile <5 x i32> %add, ptr addrspace(3) undef
   ret void
 }
 
@@ -126,18 +126,18 @@ ret:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
 ; GCN: s_cbranch_vccnz
-define amdgpu_kernel void @undef_v3f64(<3 x double> addrspace(3)* %ptr, i1 %cond) {
+define amdgpu_kernel void @undef_v3f64(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
 
 loop:
   %phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ]
-  %load = load volatile <3 x double>, <3 x double> addrspace(3)* %ptr
+  %load = load volatile <3 x double>, ptr addrspace(3) %ptr
   %add = fadd <3 x double> %load, %phi
   br i1 %cond, label %loop, label %ret
 
 ret:
-  store volatile <3 x double> %add, <3 x double> addrspace(3)* %ptr
+  store volatile <3 x double> %add, ptr addrspace(3) %ptr
   ret void
 }
 
@@ -145,18 +145,18 @@ ret:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
 ; GCN: s_cbranch_vccnz
-define amdgpu_kernel void @undef_v3i64(<3 x i64> addrspace(3)* %ptr, i1 %cond) {
+define amdgpu_kernel void @undef_v3i64(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
 
 loop:
   %phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ]
-  %load = load volatile <3 x i64>, <3 x i64> addrspace(3)* %ptr
+  %load = load volatile <3 x i64>, ptr addrspace(3) %ptr
   %add = add <3 x i64> %load, %phi
   br i1 %cond, label %loop, label %ret
 
 ret:
-  store volatile <3 x i64> %add, <3 x i64> addrspace(3)* %ptr
+  store volatile <3 x i64> %add, ptr addrspace(3) %ptr
   ret void
 }
 
@@ -165,18 +165,18 @@ ret:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
 ; GCN: s_cbranch_vccnz
-define amdgpu_kernel void @undef_v4f16(<4 x half> addrspace(3)* %ptr, i1 %cond) {
+define amdgpu_kernel void @undef_v4f16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
 
 loop:
   %phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ]
-  %load = load volatile <4 x half>, <4 x half> addrspace(3)* %ptr
+  %load = load volatile <4 x half>, ptr addrspace(3) %ptr
   %add = fadd <4 x half> %load, %phi
   br i1 %cond, label %loop, label %ret
 
 ret:
-  store volatile <4 x half> %add, <4 x half> addrspace(3)* %ptr
+  store volatile <4 x half> %add, ptr addrspace(3) %ptr
   ret void
 }
 
@@ -184,18 +184,18 @@ ret:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
 ; GCN: s_cbranch_vccnz
-define amdgpu_kernel void @undef_v4i16(<4 x i16> addrspace(3)* %ptr, i1 %cond) {
+define amdgpu_kernel void @undef_v4i16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
 
 loop:
   %phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ]
-  %load = load volatile <4 x i16>, <4 x i16> addrspace(3)* %ptr
+  %load = load volatile <4 x i16>, ptr addrspace(3) %ptr
   %add = add <4 x i16> %load, %phi
   br i1 %cond, label %loop, label %ret
 
 ret:
-  store volatile <4 x i16> %add, <4 x i16> addrspace(3)* %ptr
+  store volatile <4 x i16> %add, ptr addrspace(3) %ptr
   ret void
 }
 
@@ -204,18 +204,18 @@ ret:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
 ; GCN: s_cbranch_vccnz
-define amdgpu_kernel void @undef_v2f16(<2 x half> addrspace(3)* %ptr, i1 %cond) {
+define amdgpu_kernel void @undef_v2f16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
 
 loop:
   %phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ]
-  %load = load volatile <2 x half>, <2 x half> addrspace(3)* %ptr
+  %load = load volatile <2 x half>, ptr addrspace(3) %ptr
   %add = fadd <2 x half> %load, %phi
   br i1 %cond, label %loop, label %ret
 
 ret:
-  store volatile <2 x half> %add, <2 x half> addrspace(3)* %ptr
+  store volatile <2 x half> %add, ptr addrspace(3) %ptr
   ret void
 }
 
@@ -223,18 +223,18 @@ ret:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
 ; GCN: s_cbranch_vccnz
-define amdgpu_kernel void @undef_v2i16(<2 x i16> addrspace(3)* %ptr, i1 %cond) {
+define amdgpu_kernel void @undef_v2i16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
 
 loop:
   %phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ]
-  %load = load volatile <2 x i16>, <2 x i16> addrspace(3)* %ptr
+  %load = load volatile <2 x i16>, ptr addrspace(3) %ptr
   %add = add <2 x i16> %load, %phi
   br i1 %cond, label %loop, label %ret
 
 ret:
-  store volatile <2 x i16> %add, <2 x i16> addrspace(3)* %ptr
+  store volatile <2 x i16> %add, ptr addrspace(3) %ptr
   ret void
 }
 
@@ -255,6 +255,6 @@ define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) {
   %i5 = extractelement <3 x i64> %i3, i64 1
   %i6 = mul i64 %i5, %arg2
   %i7 = add i64 %i6, %i4
-  store volatile i64 %i7, i64 addrspace(1)* undef, align 4
+  store volatile i64 %i7, ptr addrspace(1) undef, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index efaf2b2e1c469..b1cdc79016fc9 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -16,24 +16,24 @@
 ; SelectionDAGBuilder for some reason changes the select type.
 ; VI: v_cndmask_b32
 ; VI: v_cndmask_b32
-define amdgpu_kernel void @v_select_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %a.ptr, <2 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
-  %a = load <2 x i8>, <2 x i8> addrspace(1)* %a.ptr, align 2
-  %b = load <2 x i8>, <2 x i8> addrspace(1)* %b.ptr, align 2
+define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+  %a = load <2 x i8>, ptr addrspace(1) %a.ptr, align 2
+  %b = load <2 x i8>, ptr addrspace(1) %b.ptr, align 2
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
-  store <2 x i8> %select, <2 x i8> addrspace(1)* %out, align 2
+  store <2 x i8> %select, ptr addrspace(1) %out, align 2
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_select_v4i8:
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %a.ptr, <4 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
-  %a = load <4 x i8>, <4 x i8> addrspace(1)* %a.ptr
-  %b = load <4 x i8>, <4 x i8> addrspace(1)* %b.ptr
+define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+  %a = load <4 x i8>, ptr addrspace(1) %a.ptr
+  %b = load <4 x i8>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
-  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
+  store <4 x i8> %select, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -41,12 +41,12 @@ define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> a
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %a.ptr, <8 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
-  %a = load <8 x i8>, <8 x i8> addrspace(1)* %a.ptr
-  %b = load <8 x i8>, <8 x i8> addrspace(1)* %b.ptr
+define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+  %a = load <8 x i8>, ptr addrspace(1) %a.ptr
+  %b = load <8 x i8>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
-  store <8 x i8> %select, <8 x i8> addrspace(1)* %out, align 4
+  store <8 x i8> %select, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -56,12 +56,12 @@ define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> a
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %a.ptr, <16 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
-  %a = load <16 x i8>, <16 x i8> addrspace(1)* %a.ptr
-  %b = load <16 x i8>, <16 x i8> addrspace(1)* %b.ptr
+define amdgpu_kernel void @v_select_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+  %a = load <16 x i8>, ptr addrspace(1) %a.ptr
+  %b = load <16 x i8>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
-  store <16 x i8> %select, <16 x i8> addrspace(1)* %out, align 4
+  store <16 x i8> %select, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -71,10 +71,10 @@ define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8
 
 ; SI: s_cselect_b32
 ; SI-NOT: cndmask
-define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
+define amdgpu_kernel void @select_v4i8(ptr addrspace(1) %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
   %cmp = icmp eq i8 %c, 0
   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
-  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
+  store <4 x i8> %select, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -85,10 +85,10 @@ define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a,
 
 ; SI: s_cselect_b32
 ; SI-NOT: v_cndmask_b32e
-define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
+define amdgpu_kernel void @select_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
-  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %select, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -97,12 +97,12 @@ define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GCN: buffer_load_dword v
 ; GCN: v_cndmask_b32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.ptr
+define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+  %a = load <2 x i16>, ptr addrspace(1) %a.ptr
+  %b = load <2 x i16>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
-  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %select, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -115,12 +115,12 @@ define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 ; VI: s_cselect_b32
 ; GFX9: cndmask
 ; GFX9: cndmask
-define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
-  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr
-  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr
+define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+  %a = load <3 x i16>, ptr addrspace(1) %a.ptr
+  %b = load <3 x i16>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
-  store <3 x i16> %select, <3 x i16> addrspace(1)* %out, align 4
+  store <3 x i16> %select, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -128,12 +128,12 @@ define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %a.ptr, <4 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
-  %a = load <4 x i16>, <4 x i16> addrspace(1)* %a.ptr
-  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b.ptr
+define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+  %a = load <4 x i16>, ptr addrspace(1) %a.ptr
+  %b = load <4 x i16>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
-  store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
+  store <4 x i16> %select, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -143,12 +143,12 @@ define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %a.ptr, <8 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
-  %a = load <8 x i16>, <8 x i16> addrspace(1)* %a.ptr
-  %b = load <8 x i16>, <8 x i16> addrspace(1)* %b.ptr
+define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+  %a = load <8 x i16>, ptr addrspace(1) %a.ptr
+  %b = load <8 x i16>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
-  store <8 x i16> %select, <8 x i16> addrspace(1)* %out, align 4
+  store <8 x i16> %select, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -159,10 +159,10 @@ define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+define amdgpu_kernel void @s_select_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
-  store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
+  store <2 x i32> %select, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -172,10 +172,10 @@ define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+define amdgpu_kernel void @s_select_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
-  store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
+  store <4 x i32> %select, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -188,12 +188,12 @@ define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @v_select_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %cond) #0 {
 bb:
   %tmp2 = icmp ult i32 %cond, 32
-  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %val = load <4 x i32>, ptr addrspace(1) %in
   %tmp3 = select i1 %tmp2, <4 x i32> %val, <4 x i32> zeroinitializer
-  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 16
+  store <4 x i32> %tmp3, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -206,10 +206,10 @@ bb:
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
-define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
+define amdgpu_kernel void @select_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
-  store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
+  store <8 x i32> %select, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -218,10 +218,10 @@ define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32>
 ; GCN-DAG: s_cselect_b32
 ; GCN-DAG: s_cselect_b32
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
+define amdgpu_kernel void @s_select_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
-  store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
+  store <2 x float> %select, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -233,10 +233,10 @@ define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x f
 ; GCN: s_cselect_b32
 
 ; GCN: buffer_store_dwordx
-define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b, i32 %c) #0 {
+define amdgpu_kernel void @s_select_v3f32(ptr addrspace(1) %out, <3 x float> %a, <3 x float> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <3 x float> %a, <3 x float> %b
-  store <3 x float> %select, <3 x float> addrspace(1)* %out, align 16
+  store <3 x float> %select, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -250,10 +250,10 @@ define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x f
 ; GCN: s_cselect_b32
 
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
+define amdgpu_kernel void @s_select_v4f32(ptr addrspace(1) %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
-  store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
+  store <4 x float> %select, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -266,12 +266,12 @@ define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x f
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @v_select_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %cond) #0 {
 bb:
   %tmp2 = icmp ult i32 %cond, 32
-  %val = load <4 x float>, <4 x float> addrspace(1)* %in
+  %val = load <4 x float>, ptr addrspace(1) %in
   %tmp3 = select i1 %tmp2, <4 x float> %val, <4 x float> zeroinitializer
-  store <4 x float> %tmp3, <4 x float> addrspace(1)* %out, align 16
+  store <4 x float> %tmp3, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -285,10 +285,10 @@ bb:
 ; GCN: s_cselect_b32
 
 ; GCN: buffer_store_dwordx
-define amdgpu_kernel void @s_select_v5f32(<5 x float> addrspace(1)* %out, <5 x float> %a, <5 x float> %b, i32 %c) #0 {
+define amdgpu_kernel void @s_select_v5f32(ptr addrspace(1) %out, <5 x float> %a, <5 x float> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <5 x float> %a, <5 x float> %b
-  store <5 x float> %select, <5 x float> addrspace(1)* %out, align 16
+  store <5 x float> %select, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -301,10 +301,10 @@ define amdgpu_kernel void @s_select_v5f32(<5 x float> addrspace(1)* %out, <5 x f
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
-define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
+define amdgpu_kernel void @select_v8f32(ptr addrspace(1) %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
-  store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
+  store <8 x float> %select, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -313,10 +313,10 @@ define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x flo
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
-define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
+define amdgpu_kernel void @select_v2f64(ptr addrspace(1) %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
-  store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
+  store <2 x double> %select, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -329,10 +329,10 @@ define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x do
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
-define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
+define amdgpu_kernel void @select_v4f64(ptr addrspace(1) %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
-  store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
+  store <4 x double> %select, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -353,22 +353,22 @@ define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x do
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
-define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
+define amdgpu_kernel void @select_v8f64(ptr addrspace(1) %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
-  store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
+  store <8 x double> %select, ptr addrspace(1) %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_select_v2f16:
 ; GCN: v_cndmask_b32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
-  %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
-  %b = load <2 x half>, <2 x half> addrspace(1)* %b.ptr
+define amdgpu_kernel void @v_select_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+  %a = load <2 x half>, ptr addrspace(1) %a.ptr
+  %b = load <2 x half>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x half> %a, <2 x half> %b
-  store <2 x half> %select, <2 x half> addrspace(1)* %out, align 4
+  store <2 x half> %select, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -376,12 +376,12 @@ define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x ha
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %a.ptr, <3 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
-  %a = load <3 x half>, <3 x half> addrspace(1)* %a.ptr
-  %b = load <3 x half>, <3 x half> addrspace(1)* %b.ptr
+define amdgpu_kernel void @v_select_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+  %a = load <3 x half>, ptr addrspace(1) %a.ptr
+  %b = load <3 x half>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <3 x half> %a, <3 x half> %b
-  store <3 x half> %select, <3 x half> addrspace(1)* %out, align 4
+  store <3 x half> %select, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -389,12 +389,12 @@ define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x ha
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %a.ptr, <4 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
-  %a = load <4 x half>, <4 x half> addrspace(1)* %a.ptr
-  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
+define amdgpu_kernel void @v_select_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+  %a = load <4 x half>, ptr addrspace(1) %a.ptr
+  %b = load <4 x half>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
-  store <4 x half> %select, <4 x half> addrspace(1)* %out, align 4
+  store <4 x half> %select, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 033afeb3adcac..abee0b2d9c5b4 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -91,19 +91,19 @@ define amdgpu_kernel void @select_f16(
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b,
-    half addrspace(1)* %c,
-    half addrspace(1)* %d) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b,
+    ptr addrspace(1) %c,
+    ptr addrspace(1) %d) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
-  %c.val = load volatile half, half addrspace(1)* %c
-  %d.val = load volatile half, half addrspace(1)* %d
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
+  %c.val = load volatile half, ptr addrspace(1) %c
+  %d.val = load volatile half, ptr addrspace(1) %d
   %fcmp = fcmp olt half %a.val, %b.val
   %r.val = select i1 %fcmp, half %c.val, half %d.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -173,17 +173,17 @@ define amdgpu_kernel void @select_f16_imm_a(
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
-    half addrspace(1)* %r,
-    half addrspace(1)* %b,
-    half addrspace(1)* %c,
-    half addrspace(1)* %d) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %b,
+    ptr addrspace(1) %c,
+    ptr addrspace(1) %d) {
 entry:
-  %b.val = load volatile half, half addrspace(1)* %b
-  %c.val = load volatile half, half addrspace(1)* %c
-  %d.val = load volatile half, half addrspace(1)* %d
+  %b.val = load volatile half, ptr addrspace(1) %b
+  %c.val = load volatile half, ptr addrspace(1) %c
+  %d.val = load volatile half, ptr addrspace(1) %d
   %fcmp = fcmp olt half 0xH3800, %b.val
   %r.val = select i1 %fcmp, half %c.val, half %d.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -253,17 +253,17 @@ define amdgpu_kernel void @select_f16_imm_b(
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %c,
-    half addrspace(1)* %d) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %c,
+    ptr addrspace(1) %d) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %c.val = load volatile half, half addrspace(1)* %c
-  %d.val = load volatile half, half addrspace(1)* %d
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %c.val = load volatile half, ptr addrspace(1) %c
+  %d.val = load volatile half, ptr addrspace(1) %d
   %fcmp = fcmp olt half %a.val, 0xH3800
   %r.val = select i1 %fcmp, half %c.val, half %d.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -334,17 +334,17 @@ define amdgpu_kernel void @select_f16_imm_c(
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b,
-    half addrspace(1)* %d) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b,
+    ptr addrspace(1) %d) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
-  %d.val = load volatile half, half addrspace(1)* %d
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
+  %d.val = load volatile half, ptr addrspace(1) %d
   %fcmp = fcmp olt half %a.val, %b.val
   %r.val = select i1 %fcmp, half 0xH3800, half %d.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -415,17 +415,17 @@ define amdgpu_kernel void @select_f16_imm_d(
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b,
-    half addrspace(1)* %c) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b,
+    ptr addrspace(1) %c) {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
-  %c.val = load volatile half, half addrspace(1)* %c
+  %a.val = load volatile half, ptr addrspace(1) %a
+  %b.val = load volatile half, ptr addrspace(1) %b
+  %c.val = load volatile half, ptr addrspace(1) %c
   %fcmp = fcmp olt half %a.val, %b.val
   %r.val = select i1 %fcmp, half %c.val, half 0xH3800
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -539,19 +539,19 @@ define amdgpu_kernel void @select_v2f16(
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-    <2 x half> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b,
-    <2 x half> addrspace(1)* %c,
-    <2 x half> addrspace(1)* %d) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b,
+    ptr addrspace(1) %c,
+    ptr addrspace(1) %d) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
-  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
-  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
+  %c.val = load <2 x half>, ptr addrspace(1) %c
+  %d.val = load <2 x half>, ptr addrspace(1) %d
   %fcmp = fcmp olt <2 x half> %a.val, %b.val
   %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
-  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  store <2 x half> %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -640,17 +640,17 @@ define amdgpu_kernel void @select_v2f16_imm_a(
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
-    <2 x half> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %b,
-    <2 x half> addrspace(1)* %c,
-    <2 x half> addrspace(1)* %d) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %b,
+    ptr addrspace(1) %c,
+    ptr addrspace(1) %d) {
 entry:
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
-  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
-  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
+  %b.val = load <2 x half>, ptr addrspace(1) %b
+  %c.val = load <2 x half>, ptr addrspace(1) %c
+  %d.val = load <2 x half>, ptr addrspace(1) %d
   %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
   %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
-  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  store <2 x half> %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -739,17 +739,17 @@ define amdgpu_kernel void @select_v2f16_imm_b(
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
-    <2 x half> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %c,
-    <2 x half> addrspace(1)* %d) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %c,
+    ptr addrspace(1) %d) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
-  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %c.val = load <2 x half>, ptr addrspace(1) %c
+  %d.val = load <2 x half>, ptr addrspace(1) %d
   %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
   %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
-  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  store <2 x half> %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -840,17 +840,17 @@ define amdgpu_kernel void @select_v2f16_imm_c(
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
-    <2 x half> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b,
-    <2 x half> addrspace(1)* %d) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b,
+    ptr addrspace(1) %d) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
-  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
+  %d.val = load <2 x half>, ptr addrspace(1) %d
   %fcmp = fcmp olt <2 x half> %a.val, %b.val
   %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
-  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  store <2 x half> %r.val, ptr addrspace(1) %r
   ret void
 }
 
@@ -941,16 +941,16 @@ define amdgpu_kernel void @select_v2f16_imm_d(
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
-    <2 x half> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a,
-    <2 x half> addrspace(1)* %b,
-    <2 x half> addrspace(1)* %c) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b,
+    ptr addrspace(1) %c) {
 entry:
-  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
-  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
-  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
+  %a.val = load <2 x half>, ptr addrspace(1) %a
+  %b.val = load <2 x half>, ptr addrspace(1) %b
+  %c.val = load <2 x half>, ptr addrspace(1) %c
   %fcmp = fcmp olt <2 x half> %a.val, %b.val
   %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
-  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  store <2 x half> %r.val, ptr addrspace(1) %r
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/select.ll b/llvm/test/CodeGen/AMDGPU/select.ll
index e53c159a2f712..ceb4ef1659867 100644
--- a/llvm/test/CodeGen/AMDGPU/select.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.ll
@@ -14,9 +14,9 @@
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
-define amdgpu_kernel void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
-                     <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out,
-                     <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out,
+define amdgpu_kernel void @select (ptr addrspace(1) %i32out, ptr addrspace(1) %f32out,
+                     ptr addrspace(1) %v2i32out, ptr addrspace(1) %v2f32out,
+                     ptr addrspace(1) %v4i32out, ptr addrspace(1) %v4f32out,
                      i32 %cond) {
 entry:
   br label %for
@@ -37,11 +37,11 @@ for:
   br i1 %0, label %body, label %done
 
 done:
-  store i32 %1, i32 addrspace(1)* %i32out
-  store float %2, float addrspace(1)* %f32out
-  store <2 x i32> %3, <2 x i32> addrspace(1)* %v2i32out
-  store <2 x float> %4, <2 x float> addrspace(1)* %v2f32out
-  store <4 x i32> %5, <4 x i32> addrspace(1)* %v4i32out
-  store <4 x float> %6, <4 x float> addrspace(1)* %v4f32out
+  store i32 %1, ptr addrspace(1) %i32out
+  store float %2, ptr addrspace(1) %f32out
+  store <2 x i32> %3, ptr addrspace(1) %v2i32out
+  store <2 x float> %4, ptr addrspace(1) %v2f32out
+  store <4 x i32> %5, ptr addrspace(1) %v4i32out
+  store <4 x float> %6, ptr addrspace(1) %v4f32out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll
index 0224708fb586c..584dfdd9269dc 100644
--- a/llvm/test/CodeGen/AMDGPU/select64.ll
+++ b/llvm/test/CodeGen/AMDGPU/select64.ll
@@ -7,46 +7,46 @@
 ; GCN-NOT: s_lshr_b64
 ; GCN: s_cselect_b32
 ; GCN: s_cselect_b32
-define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
+define amdgpu_kernel void @select0(ptr addrspace(1) %out, i32 %cond, i64 %in) {
 entry:
   %0 = icmp ugt i32 %cond, 5
   %1 = select i1 %0, i64 0, i64 %in
-  store i64 %1, i64 addrspace(1)* %out
+  store i64 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}select_trunc_i64:
 ; GCN: s_cselect_b32
 ; GCN-NOT: s_cselect_b32
-define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
+define amdgpu_kernel void @select_trunc_i64(ptr addrspace(1) %out, i32 %cond, i64 %in) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 0, i64 %in
   %trunc = trunc i64 %sel to i32
-  store i32 %trunc, i32 addrspace(1)* %out, align 4
+  store i32 %trunc, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}select_trunc_i64_2:
 ; GCN: s_cselect_b32
 ; GCN-NOT: s_cselect_b32
-define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @select_trunc_i64_2(ptr addrspace(1) %out, i32 %cond, i64 %a, i64 %b) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 %a, i64 %b
   %trunc = trunc i64 %sel to i32
-  store i32 %trunc, i32 addrspace(1)* %out, align 4
+  store i32 %trunc, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_select_trunc_i64_2:
 ; GCN: s_cselect_b32
 ; GCN-NOT: s_cselect_b32
-define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_select_trunc_i64_2(ptr addrspace(1) %out, i32 %cond, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
   %cmp = icmp ugt i32 %cond, 5
-  %a = load i64, i64 addrspace(1)* %aptr, align 8
-  %b = load i64, i64 addrspace(1)* %bptr, align 8
+  %a = load i64, ptr addrspace(1) %aptr, align 8
+  %b = load i64, ptr addrspace(1) %bptr, align 8
   %sel = select i1 %cmp, i64 %a, i64 %b
   %trunc = trunc i64 %sel to i32
-  store i32 %trunc, i32 addrspace(1)* %out, align 4
+  store i32 %trunc, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -54,11 +54,11 @@ define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %con
 ; GCN-DAG: s_cselect_b32
 ; GCN-DAG: s_cselect_b32
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_select_i64_split_imm(ptr addrspace(1) %out, i32 %cond, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
   %cmp = icmp ugt i32 %cond, 5
-  %a = load i64, i64 addrspace(1)* %aptr, align 8
-  %b = load i64, i64 addrspace(1)* %bptr, align 8
+  %a = load i64, ptr addrspace(1) %aptr, align 8
+  %b = load i64, ptr addrspace(1) %bptr, align 8
   %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32
-  store i64 %sel, i64 addrspace(1)* %out, align 8
+  store i64 %sel, ptr addrspace(1) %out, align 8
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll b/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll
index 18616851c9c24..a8af42877ae63 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll
@@ -3,10 +3,10 @@
 ;CHECK-NOT: SETE
 ;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x,
 ;CHECK: 1073741824
-define amdgpu_kernel void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
-  %1 = load float, float addrspace(1)* %in
+define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %1 = load float, ptr addrspace(1) %in
   %2 = fcmp oeq float %1, 0.0
   %3 = select i1 %2, float 1.0, float 2.0
-  store float %3, float addrspace(1)* %out
+  store float %3, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll b/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
index 1504165d3d2bc..14640c108913f 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
@@ -3,10 +3,10 @@
 ;CHECK-NOT: SETE_INT
 ;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x,
 ;CHECK-NEXT: 2
-define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %1 = load i32, i32 addrspace(1)* %in
+define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %1 = load i32, ptr addrspace(1) %in
   %2 = icmp eq i32 %1, 0
   %3 = select i1 %2, i32 1, i32 2
-  store i32 %3, i32 addrspace(1)* %out
+  store i32 %3, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll b/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
index c3c3023280162..d193a19e4a683 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
@@ -3,7 +3,7 @@
 
 ; Test a selectcc with i32 LHS/RHS and float True/False
 
-define amdgpu_kernel void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; CHECK-LABEL: test:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
@@ -22,9 +22,9 @@ define amdgpu_kernel void @test(float addrspace(1)* %out, i32 addrspace(1)* %in)
 ; CHECK-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; CHECK-NEXT:    1065353216(1.000000e+00), 2(2.802597e-45)
 entry:
-  %0 = load i32, i32 addrspace(1)* %in
+  %0 = load i32, ptr addrspace(1) %in
   %1 = icmp sge i32 %0, 0
   %2 = select i1 %1, float 1.0, float 0.0
-  store float %2, float addrspace(1)* %out
+  store float %2, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
index 563d86daa55cb..9386a55eee1c0 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
@@ -7,7 +7,7 @@
 ; EG-NOT: CND
 ; EG: SET{{[NEQGTL]+}}_DX10
 
-define amdgpu_kernel void @test_a(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test_a(ptr addrspace(1) %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 0.000000e+00
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -19,12 +19,12 @@ entry:
   br i1 %6, label %IF, label %ENDIF
 
 IF:
-  %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  store i32 0, i32 addrspace(1)* %7
+  %7 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  store i32 0, ptr addrspace(1) %7
   br label %ENDIF
 
 ENDIF:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   ret void
 }
 
@@ -36,7 +36,7 @@ ENDIF:
 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
 ; EG-NEXT: PRED_
 ; EG-NEXT: ALU clause starting
-define amdgpu_kernel void @test_b(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test_b(ptr addrspace(1) %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 0.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -48,23 +48,23 @@ entry:
   br i1 %6, label %ENDIF, label %IF
 
 IF:
-  %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  store i32 0, i32 addrspace(1)* %7
+  %7 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  store i32 0, ptr addrspace(1) %7
   br label %ENDIF
 
 ENDIF:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   ret void
 }
 
 ; Test a CND*_INT instruction with float true/false values
 ; EG-LABEL: {{^}}test_c:
 ; EG: CND{{[GTE]+}}_INT
-define amdgpu_kernel void @test_c(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test_c(ptr addrspace(1) %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
   %1 = select i1 %0, float 2.0, float 3.0
-  store float %1, float addrspace(1)* %out
+  store float %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -73,9 +73,9 @@ entry:
 ; SI: v_cndmask_b32_e64
 ; SI-NOT: cmp
 ; SI-NOT: cndmask
-define amdgpu_kernel void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @selectcc_bool(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = select i1 %icmp0, i32 -1, i32 0
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/selectcc.ll b/llvm/test/CodeGen/AMDGPU/selectcc.ll
index b743ba3ba8879..b247dbd373e23 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc.ll
@@ -11,10 +11,10 @@
 ; SI: v_cmp_eq_u64
 ; VI: s_cmp_eq_u64
 ; GCN: s_cselect_b32
-define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
+define amdgpu_kernel void @selectcc_i64(ptr addrspace(1) %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
 entry:
   %0 = icmp eq i64 %lhs, %rhs
   %1 = select i1 %0, i64 %true, i64 %false
-  store i64 %1, i64 addrspace(1)* %out
+  store i64 %1, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll b/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll
index 853afa8772ea6..fee0e6d5ce98e 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll
@@ -3,12 +3,12 @@
 ; EG-LABEL: {{^}}and_setcc_setcc_i32:
 ; EG: AND_INT
 ; EG-NEXT: SETE_INT
-define amdgpu_kernel void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @and_setcc_setcc_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
   %cmp1 = icmp eq i32 %a, -1
   %cmp2 = icmp eq i32 %b, -1
   %and = and i1 %cmp1, %cmp2
   %ext = sext i1 %and to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -20,11 +20,11 @@ define amdgpu_kernel void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i
 ; EG: SETE_INT
 ; EG: AND_INT
 ; EG: SETE_INT
-define amdgpu_kernel void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+define amdgpu_kernel void @and_setcc_setcc_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) {
   %cmp1 = icmp eq <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
   %cmp2 = icmp eq <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
   %and = and <4 x i1> %cmp1, %cmp2
   %ext = sext <4 x i1> %and to <4 x i32>
-  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out, align 4
+  store <4 x i32> %ext, ptr addrspace(1) %out, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll b/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
index 8da83cd2bfed7..a8a02cceb3fb5 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
@@ -11,17 +11,17 @@
 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]]
 ; GCN: buffer_store_dword [[MUL]]
 define amdgpu_kernel void @multi_use_fneg_src() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %b = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %b = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
 
   %mul = fmul float %a, %b
   %neg.mul = fsub float -0.0, %mul
   %cmp = fcmp oeq float %neg.mul, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
-  store volatile float %mul, float addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
+  store volatile float %mul, ptr addrspace(1) undef
   ret void
 }
 
@@ -34,10 +34,10 @@ define amdgpu_kernel void @multi_use_fneg_src() #0 {
 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[A]]
 ; GCN: v_mul_f32_e64 [[USE1:v[0-9]+]], [[MUL]], -[[MUL]]
 define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %b = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %b = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
 
   %mul = fmul float %a, %b
   %neg.mul = fsub float -0.0, %mul
@@ -45,8 +45,8 @@ define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 {
   %cmp = fcmp oeq float %neg.mul, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
 
-  store volatile i32 %select, i32 addrspace(1)* undef
-  store volatile float %use1, float addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
+  store volatile float %use1, ptr addrspace(1) undef
   ret void
 }
 
@@ -60,17 +60,17 @@ define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 {
 ; GCN-NOT: xor
 ; GCN: buffer_store_dword [[MUL]]
 define amdgpu_kernel void @multi_use_fneg() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %b = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %b = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
 
   %mul = fmul float %a, %b
   %neg.mul = fsub float -0.0, %mul
   %cmp = fcmp oeq float %neg.mul, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
-  store volatile float %neg.mul, float addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
+  store volatile float %neg.mul, ptr addrspace(1) undef
   ret void
 }
 
@@ -83,175 +83,175 @@ define amdgpu_kernel void @multi_use_fneg() #0 {
 ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], [[MUL0]]
 ; GCN: buffer_store_dword [[MUL1]]
 define amdgpu_kernel void @multi_foldable_use_fneg() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %b = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
-  %z = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %b = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
+  %z = load volatile i32, ptr addrspace(1) undef
 
   %mul = fmul float %a, %b
   %neg.mul = fsub float -0.0, %mul
   %cmp = fcmp oeq float %neg.mul, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
   %use1 = fmul float %neg.mul, %mul
-  store volatile i32 %select, i32 addrspace(1)* undef
-  store volatile float %use1, float addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
+  store volatile float %use1, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_oeq_posk_f32:
 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, v{{[0-9]+}}
 define amdgpu_kernel void @test_setcc_fneg_oeq_posk_f32() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
   %neg.a = fsub float -0.0, %a
   %cmp = fcmp oeq float %neg.a, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ogt_posk_f32:
 ; GCN: v_cmp_gt_f32_e32 vcc, -4.0, v{{[0-9]+}}
 define amdgpu_kernel void @test_setcc_fneg_ogt_posk_f32() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
   %neg.a = fsub float -0.0, %a
   %cmp = fcmp ogt float %neg.a, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_oge_posk_f32:
 ; GCN: v_cmp_ge_f32_e32 vcc, -4.0, v{{[0-9]+}}
 define amdgpu_kernel void @test_setcc_fneg_oge_posk_f32() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
   %neg.a = fsub float -0.0, %a
   %cmp = fcmp oge float %neg.a, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_olt_posk_f32:
 ; GCN: v_cmp_lt_f32_e32 vcc, -4.0, v{{[0-9]+}}
 define amdgpu_kernel void @test_setcc_fneg_olt_posk_f32() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
   %neg.a = fsub float -0.0, %a
   %cmp = fcmp olt float %neg.a, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ole_posk_f32:
 ; GCN: v_cmp_le_f32_e32 vcc, -4.0, v{{[0-9]+}}
 define amdgpu_kernel void @test_setcc_fneg_ole_posk_f32() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
   %neg.a = fsub float -0.0, %a
   %cmp = fcmp ole float %neg.a, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_one_posk_f32:
 ; GCN: v_cmp_lg_f32_e32 vcc, -4.0, v{{[0-9]+}}
 define amdgpu_kernel void @test_setcc_fneg_one_posk_f32() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
   %neg.a = fsub float -0.0, %a
   %cmp = fcmp one float %neg.a, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ueq_posk_f32:
 ; GCN: v_cmp_nlg_f32_e32 vcc, -4.0, v{{[0-9]+}}
 define amdgpu_kernel void @test_setcc_fneg_ueq_posk_f32() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
   %neg.a = fsub float -0.0, %a
   %cmp = fcmp ueq float %neg.a, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ugt_posk_f32:
 ; GCN: v_cmp_nle_f32_e32 vcc, -4.0, v{{[0-9]+}}
 define amdgpu_kernel void @test_setcc_fneg_ugt_posk_f32() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
   %neg.a = fsub float -0.0, %a
   %cmp = fcmp ugt float %neg.a, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_uge_posk_f32:
 ; GCN: v_cmp_nlt_f32_e32 vcc, -4.0, v{{[0-9]+}}
 define amdgpu_kernel void @test_setcc_fneg_uge_posk_f32() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
   %neg.a = fsub float -0.0, %a
   %cmp = fcmp uge float %neg.a, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ult_posk_f32:
 ; GCN: v_cmp_nge_f32_e32 vcc, -4.0, v{{[0-9]+}}
 define amdgpu_kernel void @test_setcc_fneg_ult_posk_f32() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
   %neg.a = fsub float -0.0, %a
   %cmp = fcmp ult float %neg.a, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ule_posk_f32:
 ; GCN: v_cmp_ngt_f32_e32 vcc, -4.0, v{{[0-9]+}}
 define amdgpu_kernel void @test_setcc_fneg_ule_posk_f32() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
   %neg.a = fsub float -0.0, %a
   %cmp = fcmp ule float %neg.a, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_une_posk_f32:
 ; GCN: v_cmp_neq_f32_e32 vcc, -4.0, v{{[0-9]+}}
 define amdgpu_kernel void @test_setcc_fneg_une_posk_f32() #0 {
-  %a = load volatile float, float addrspace(1)* undef
-  %x = load volatile i32, i32 addrspace(1)* undef
-  %y = load volatile i32, i32 addrspace(1)* undef
+  %a = load volatile float, ptr addrspace(1) undef
+  %x = load volatile i32, ptr addrspace(1) undef
+  %y = load volatile i32, ptr addrspace(1) undef
   %neg.a = fsub float -0.0, %a
   %cmp = fcmp une float %neg.a, 4.0
   %select = select i1 %cmp, i32 %x, i32 %y
-  store volatile i32 %select, i32 addrspace(1)* undef
+  store volatile i32 %select, ptr addrspace(1) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
index f5f1d630a049e..7633eb3fa1158 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
@@ -4,13 +4,13 @@
 ; GCN: s_load_dword s{{[0-9]+}}
 ; GCN: s_load_dword [[LD:s[0-9]+]],
 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013
-define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
-  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
-  %load = load i32, i32 addrspace(4)* %ptr, align 4
+define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %x) {
+  %ptr = getelementptr i32, ptr addrspace(4) %in, i32 %x
+  %load = load i32, ptr addrspace(4) %ptr, align 4
   %and = and i32 %load, 524288
   %cmp = icmp eq i32 %and, 0
   %sel = select i1 %cmp, i32 0, i32 -1
-  store i32 %sel, i32 addrspace(1)* %out
+  store i32 %sel, ptr addrspace(1) %out
   ret void
 }
 
@@ -18,13 +18,13 @@ define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addr
 ; GCN: s_load_dword s{{[0-9]+}}
 ; GCN: s_load_dword [[LD:s[0-9]+]],
 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003
-define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
-  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
-  %load = load i32, i32 addrspace(4)* %ptr, align 4
+define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %x) {
+  %ptr = getelementptr i32, ptr addrspace(4) %in, i32 %x
+  %load = load i32, ptr addrspace(4) %ptr, align 4
   %and = and i32 %load, 8
   %cmp = icmp eq i32 %and, 0
   %sel = select i1 %cmp, i32 0, i32 -1
-  store i32 %sel, i32 addrspace(1)* %out
+  store i32 %sel, ptr addrspace(1) %out
   ret void
 }
 
@@ -32,13 +32,13 @@ define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrsp
 ; GCN: s_load_dword s{{[0-9]+}}
 ; GCN: s_load_dword [[LD:s[0-9]+]],
 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013
-define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %x) {
-  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x
-  %load = load i32, i32 addrspace(1)* %ptr, align 4
+define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %x) {
+  %ptr = getelementptr i32, ptr addrspace(1) %in, i32 %x
+  %load = load i32, ptr addrspace(1) %ptr, align 4
   %and = and i32 %load, 524288
   %cmp = icmp eq i32 %and, 0
   %sel = select i1 %cmp, i32 0, i32 -1
-  store i32 %sel, i32 addrspace(1)* %out
+  store i32 %sel, ptr addrspace(1) %out
   ret void
 }
 
@@ -46,24 +46,24 @@ define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 add
 ; GCN: s_load_dword s{{[0-9]+}}
 ; GCN: s_load_dword [[LD:s[0-9]+]],
 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003
-define amdgpu_kernel void @global_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) {
-  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x
-  %load = load i32, i32 addrspace(1)* %ptr, align 4
+define amdgpu_kernel void @global_load_no_shrink_dword_to_aligned_byte(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %x) {
+  %ptr = getelementptr i32, ptr addrspace(1) %in, i32 %x
+  %load = load i32, ptr addrspace(1) %ptr, align 4
   %and = and i32 %load, 8
   %cmp = icmp eq i32 %and, 0
   %sel = select i1 %cmp, i32 0, i32 -1
-  store i32 %sel, i32 addrspace(1)* %out
+  store i32 %sel, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: const_load_shrink_dword_to_unaligned_byte:
 ; GCN: global_load_ushort
-define amdgpu_kernel void @const_load_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
-  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
-  %load = load i32, i32 addrspace(4)* %ptr, align 2
+define amdgpu_kernel void @const_load_shrink_dword_to_unaligned_byte(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %x) {
+  %ptr = getelementptr i32, ptr addrspace(4) %in, i32 %x
+  %load = load i32, ptr addrspace(4) %ptr, align 2
   %and = and i32 %load, 524288
   %cmp = icmp eq i32 %and, 0
   %sel = select i1 %cmp, i32 0, i32 -1
-  store i32 %sel, i32 addrspace(1)* %out
+  store i32 %sel, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll b/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll
index f1d36ebec3ad6..e112370b30235 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll
@@ -21,7 +21,7 @@ define i32 @f() {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc_lo
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %i = load i32, i32 addrspace(3)* null, align 16
+  %i = load i32, ptr addrspace(3) null, align 16
   %i6 = icmp ult i32 0, %i
   %i7 = sext i1 %i6 to i32
   %i8 = add i32 %i7, 1

diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
index 73d72f44a1c06..4562ea5dd3b52 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -12,11 +12,11 @@
 
 ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
 ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
-define amdgpu_kernel void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_eq_0(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 0
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -30,11 +30,11 @@ define amdgpu_kernel void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i3
 
 ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
 ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
-define amdgpu_kernel void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_0(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 0
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -45,11 +45,11 @@ define amdgpu_kernel void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i3
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_eq_neg1(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, -1
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -60,11 +60,11 @@ define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a,
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_neg1(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, -1
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -75,11 +75,11 @@ define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a,
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_0(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 0
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -90,11 +90,11 @@ define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i3
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_0(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 0
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -105,11 +105,11 @@ define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i3
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_1(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 1
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -119,11 +119,11 @@ define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i3
 ; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_1(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 1
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -132,11 +132,11 @@ define amdgpu_kernel void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i3
 ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_byte [[TMP]]
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_neg1(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, -1
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -145,11 +145,11 @@ define amdgpu_kernel void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a,
 ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[TMP]]
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_neg1(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, -1
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -169,10 +169,10 @@ define amdgpu_kernel void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a,
 ; VI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
+define amdgpu_kernel void @cmp_zext_k_i8max(ptr addrspace(1) %out, i8 %b) nounwind {
   %b.ext = zext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, 255
-  store i1 %icmp0, i1 addrspace(1)* %out
+  store i1 %icmp0, ptr addrspace(1) %out
   ret void
 }
 
@@ -182,11 +182,11 @@ define amdgpu_kernel void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwi
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
-  %b = load i8, i8 addrspace(1)* %b.ptr
+define amdgpu_kernel void @cmp_sext_k_neg1(ptr addrspace(1) %out, ptr addrspace(1) %b.ptr) nounwind {
+  %b = load i8, ptr addrspace(1) %b.ptr
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
-  store i1 %icmp0, i1 addrspace(1)* %out
+  store i1 %icmp0, ptr addrspace(1) %out
   ret void
 }
 
@@ -197,7 +197,7 @@ define amdgpu_kernel void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1
 define void @v_cmp_sext_k_neg1_i8_sext_arg(i8 signext %b) nounwind {
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
-  store i1 %icmp0, i1 addrspace(1)* undef
+  store i1 %icmp0, ptr addrspace(1) undef
   ret void
 }
 
@@ -214,10 +214,10 @@ define void @v_cmp_sext_k_neg1_i8_sext_arg(i8 signext %b) nounwind {
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
+define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(ptr addrspace(1) %out, i8 %b) nounwind {
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
-  store i1 %icmp0, i1 addrspace(1)* %out
+  store i1 %icmp0, ptr addrspace(1) %out
   ret void
 }
 
@@ -225,10 +225,10 @@ define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b)
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
+define amdgpu_kernel void @cmp_zext_k_neg1(ptr addrspace(1) %out, i8 %b) nounwind {
   %b.ext = zext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
-  store i1 %icmp0, i1 addrspace(1)* %out
+  store i1 %icmp0, ptr addrspace(1) %out
   ret void
 }
 
@@ -236,11 +236,11 @@ define amdgpu_kernel void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwin
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_k(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 2
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -248,11 +248,11 @@ define amdgpu_kernel void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i3
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_k(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 2
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -263,32 +263,32 @@ define amdgpu_kernel void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i3
 ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_byte [[K]]
-define amdgpu_kernel void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_eq_1(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 1
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[K]]
-define amdgpu_kernel void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_1(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 1
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[K]]
-define amdgpu_kernel void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_k(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 2
-  store i1 %icmp1, i1 addrspace(1)* %out
+  store i1 %icmp1, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll
index 231e8523eafe3..b250c915473ee 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc.ll
@@ -9,10 +9,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
 ; GCN: s_cmp_eq_u32
 ; GCN: s_cmp_eq_u32
-define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @setcc_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %result = icmp eq <2 x i32> %a, %b
   %sext = sext <2 x i1> %result to <2 x i32>
-  store <2 x i32> %sext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %sext, ptr addrspace(1) %out
   ret void
 }
 
@@ -26,13 +26,13 @@ define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
 ; GCN: s_cmp_eq_u32
 ; GCN: s_cmp_eq_u32
 ; GCN: s_cmp_eq_u32
-define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
-  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
-  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
+define amdgpu_kernel void @setcc_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <4 x i32>, ptr addrspace(1) %in
+  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
   %result = icmp eq <4 x i32> %a, %b
   %sext = sext <4 x i1> %result to <4 x i32>
-  store <4 x i32> %sext, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %sext, ptr addrspace(1) %out
   ret void
 }
 
@@ -43,55 +43,55 @@ define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> a
 ; FUNC-LABEL: {{^}}f32_oeq:
 ; R600: SETE_DX10
 ; GCN: v_cmp_eq_f32
-define amdgpu_kernel void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_oeq(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp oeq float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}f32_ogt:
 ; R600: SETGT_DX10
 ; GCN: v_cmp_gt_f32
-define amdgpu_kernel void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ogt(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ogt float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}f32_oge:
 ; R600: SETGE_DX10
 ; GCN: v_cmp_ge_f32
-define amdgpu_kernel void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_oge(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp oge float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}f32_olt:
 ; R600: SETGT_DX10
 ; GCN: v_cmp_lt_f32
-define amdgpu_kernel void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_olt(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp olt float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}f32_ole:
 ; R600: SETGE_DX10
 ; GCN: v_cmp_le_f32
-define amdgpu_kernel void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ole(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ole float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -103,11 +103,11 @@ entry:
 
 ; GCN: v_cmp_lg_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define amdgpu_kernel void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_one(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp one float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -117,11 +117,11 @@ entry:
 ; R600-DAG: AND_INT
 ; R600-DAG: SETNE_INT
 ; GCN: v_cmp_o_f32
-define amdgpu_kernel void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ord(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ord float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -133,11 +133,11 @@ entry:
 
 ; GCN: v_cmp_nlg_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define amdgpu_kernel void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ueq(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ueq float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -146,11 +146,11 @@ entry:
 ; R600: SETE_DX10
 ; GCN: v_cmp_nle_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define amdgpu_kernel void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ugt(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ugt float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -160,11 +160,11 @@ entry:
 
 ; GCN: v_cmp_nlt_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define amdgpu_kernel void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_uge(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp uge float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -174,11 +174,11 @@ entry:
 
 ; GCN: v_cmp_nge_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define amdgpu_kernel void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ult(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ult float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -188,22 +188,22 @@ entry:
 
 ; GCN: v_cmp_ngt_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define amdgpu_kernel void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ule(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ule float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}f32_une:
 ; R600: SETNE_DX10
 ; GCN: v_cmp_neq_f32
-define amdgpu_kernel void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_une(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp une float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -213,11 +213,11 @@ entry:
 ; R600: OR_INT
 ; R600: SETNE_INT
 ; GCN: v_cmp_u_f32
-define amdgpu_kernel void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_uno(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp uno float %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -228,110 +228,110 @@ entry:
 ; FUNC-LABEL: {{^}}i32_eq:
 ; R600: SETE_INT
 ; GCN: s_cmp_eq_u32
-define amdgpu_kernel void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_eq(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp eq i32 %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}i32_ne:
 ; R600: SETNE_INT
 ; GCN: s_cmp_lg_u32
-define amdgpu_kernel void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ne(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ne i32 %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}i32_ugt:
 ; R600: SETGT_UINT
 ; GCN: s_cmp_gt_u32
-define amdgpu_kernel void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ugt(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ugt i32 %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}i32_uge:
 ; R600: SETGE_UINT
 ; GCN: s_cmp_ge_u32
-define amdgpu_kernel void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_uge(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp uge i32 %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}i32_ult:
 ; R600: SETGT_UINT
 ; GCN: s_cmp_lt_u32
-define amdgpu_kernel void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ult(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ult i32 %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}i32_ule:
 ; R600: SETGE_UINT
 ; GCN: s_cmp_le_u32
-define amdgpu_kernel void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ule(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ule i32 %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}i32_sgt:
 ; R600: SETGT_INT
 ; GCN: s_cmp_gt_i32
-define amdgpu_kernel void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_sgt(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp sgt i32 %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}i32_sge:
 ; R600: SETGE_INT
 ; GCN: s_cmp_ge_i32
-define amdgpu_kernel void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_sge(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp sge i32 %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}i32_slt:
 ; R600: SETGT_INT
 ; GCN: s_cmp_lt_i32
-define amdgpu_kernel void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_slt(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp slt i32 %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}i32_sle:
 ; R600: SETGE_INT
 ; GCN: s_cmp_le_i32
-define amdgpu_kernel void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_sle(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp sle i32 %a, %b
   %1 = sext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -344,16 +344,16 @@ entry:
 ; GCN-DAG: v_cmp_eq_u32
 ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
 ; GCN: s_endpgm
-define amdgpu_kernel void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 {
+define amdgpu_kernel void @v3i32_eq(ptr addrspace(1) %out, ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid
-  %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid
-  %gep.out = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid
-  %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep.a
-  %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep.b
+  %gep.a = getelementptr <3 x i32>, ptr addrspace(1) %ptra, i32 %tid
+  %gep.b = getelementptr <3 x i32>, ptr addrspace(1) %ptrb, i32 %tid
+  %gep.out = getelementptr <3 x i32>, ptr addrspace(1) %out, i32 %tid
+  %a = load <3 x i32>, ptr addrspace(1) %gep.a
+  %b = load <3 x i32>, ptr addrspace(1) %gep.b
   %cmp = icmp eq <3 x i32> %a, %b
   %ext = sext <3 x i1> %cmp to <3 x i32>
-  store <3 x i32> %ext, <3 x i32> addrspace(1)* %gep.out
+  store <3 x i32> %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -365,16 +365,16 @@ define amdgpu_kernel void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addr
 ; GCN-DAG: v_cmp_eq_u32
 ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
 ; GCN: s_endpgm
-define amdgpu_kernel void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 {
+define amdgpu_kernel void @v3i8_eq(ptr addrspace(1) %out, ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid
-  %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid
-  %gep.out = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %out, i32 %tid
-  %a = load <3 x i8>, <3 x i8> addrspace(1)* %gep.a
-  %b = load <3 x i8>, <3 x i8> addrspace(1)* %gep.b
+  %gep.a = getelementptr <3 x i8>, ptr addrspace(1) %ptra, i32 %tid
+  %gep.b = getelementptr <3 x i8>, ptr addrspace(1) %ptrb, i32 %tid
+  %gep.out = getelementptr <3 x i8>, ptr addrspace(1) %out, i32 %tid
+  %a = load <3 x i8>, ptr addrspace(1) %gep.a
+  %b = load <3 x i8>, ptr addrspace(1) %gep.b
   %cmp = icmp eq <3 x i8> %a, %b
   %ext = sext <3 x i1> %cmp to <3 x i8>
-  store <3 x i8> %ext, <3 x i8> addrspace(1)* %gep.out
+  store <3 x i8> %ext, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -395,7 +395,7 @@ endif:
 ; GCN-DAG: v_cmp_nge_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
 ; GCN-DAG: v_cmp_nle_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
 ; GCN: s_or_b64 s[2:3], [[A]], [[B]]
-define amdgpu_kernel void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 {
+define amdgpu_kernel void @setcc-i1-and-xor(ptr addrspace(1) %out, float %cond) #0 {
 bb0:
   %tmp5 = fcmp oge float %cond, 0.000000e+00
   %tmp7 = fcmp ole float %cond, 1.000000e+00
@@ -404,7 +404,7 @@ bb0:
   br i1 %tmp11, label %bb2, label %bb1
 
 bb1:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   br label %bb2
 
 bb2:
@@ -415,14 +415,14 @@ bb2:
 ; GCN: s_cmp_gt_i32
 ; GCN: s_cmp_gt_i32
 define amdgpu_kernel void @setcc_v2i32_expand(
-  <2 x i32> addrspace(1)* %a,
-  <2 x i32> addrspace(1)* %b,
-  <2 x i32> addrspace(1)* %c,
-  <2 x float> addrspace(1)* %r) {
+  ptr addrspace(1) %a,
+  ptr addrspace(1) %b,
+  ptr addrspace(1) %c,
+  ptr addrspace(1) %r) {
 entry:
-  %a.val = load <2 x i32>, <2 x i32> addrspace(1)* %a
-  %b.val = load <2 x i32>, <2 x i32> addrspace(1)* %b
-  %c.val = load <2 x i32>, <2 x i32> addrspace(1)* %c
+  %a.val = load <2 x i32>, ptr addrspace(1) %a
+  %b.val = load <2 x i32>, ptr addrspace(1) %b
+  %c.val = load <2 x i32>, ptr addrspace(1) %c
 
   %icmp.val.1 = icmp sgt <2 x i32> %a.val, <i32 1, i32 1>
   %zext.val.1 = zext <2 x i1> %icmp.val.1 to <2 x i32>
@@ -432,7 +432,7 @@ entry:
   %icmp.val.2 = icmp sgt <2 x i32> %c.val, <i32 1199570944, i32 1199570944>
   %select.val.1 = select <2 x i1> %icmp.val.2, <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float> %bitcast.val.1
 
-  store <2 x float> %select.val.1, <2 x float> addrspace(1)* %r
+  store <2 x float> %select.val.1, ptr addrspace(1) %r
   ret void
 }
 
@@ -442,14 +442,14 @@ entry:
 ; GCN: s_cmp_gt_i32
 ; GCN: s_cmp_gt_i32
 define amdgpu_kernel void @setcc_v4i32_expand(
-  <4 x i32> addrspace(1)* %a,
-  <4 x i32> addrspace(1)* %b,
-  <4 x i32> addrspace(1)* %c,
-  <4 x float> addrspace(1)* %r) {
+  ptr addrspace(1) %a,
+  ptr addrspace(1) %b,
+  ptr addrspace(1) %c,
+  ptr addrspace(1) %r) {
 entry:
-  %a.val = load <4 x i32>, <4 x i32> addrspace(1)* %a
-  %b.val = load <4 x i32>, <4 x i32> addrspace(1)* %b
-  %c.val = load <4 x i32>, <4 x i32> addrspace(1)* %c
+  %a.val = load <4 x i32>, ptr addrspace(1) %a
+  %b.val = load <4 x i32>, ptr addrspace(1) %b
+  %c.val = load <4 x i32>, ptr addrspace(1) %c
 
   %icmp.val.1 = icmp sgt <4 x i32> %a.val, <i32 1, i32 1, i32 1, i32 1>
   %zext.val.1 = zext <4 x i1> %icmp.val.1 to <4 x i32>
@@ -459,7 +459,7 @@ entry:
   %icmp.val.2 = icmp sgt <4 x i32> %c.val, <i32 1199570944, i32 1199570944, i32 1199570944, i32 1199570944>
   %select.val.1 = select <4 x i1> %icmp.val.2, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> %bitcast.val.1
 
-  store <4 x float> %select.val.1, <4 x float> addrspace(1)* %r
+  store <4 x float> %select.val.1, ptr addrspace(1) %r
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/setcc64.ll b/llvm/test/CodeGen/AMDGPU/setcc64.ll
index 5b7099ff7ed2d..50029dfc66662 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc64.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc64.ll
@@ -9,83 +9,83 @@
 
 ; GCN-LABEL: {{^}}f64_oeq:
 ; GCN: v_cmp_eq_f64
-define amdgpu_kernel void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_oeq(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp oeq double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}f64_ogt:
 ; GCN: v_cmp_gt_f64
-define amdgpu_kernel void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ogt(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ogt double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}f64_oge:
 ; GCN: v_cmp_ge_f64
-define amdgpu_kernel void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_oge(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp oge double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}f64_olt:
 ; GCN: v_cmp_lt_f64
-define amdgpu_kernel void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_olt(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp olt double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}f64_ole:
 ; GCN: v_cmp_le_f64
-define amdgpu_kernel void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ole(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ole double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}f64_one:
 ; GCN: v_cmp_lg_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define amdgpu_kernel void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_one(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp one double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}f64_ord:
 ; GCN: v_cmp_o_f64
-define amdgpu_kernel void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ord(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ord double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}f64_ueq:
 ; GCN: v_cmp_nlg_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define amdgpu_kernel void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ueq(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ueq double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -93,64 +93,64 @@ entry:
 
 ; GCN: v_cmp_nle_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define amdgpu_kernel void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ugt(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ugt double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}f64_uge:
 ; GCN: v_cmp_nlt_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define amdgpu_kernel void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_uge(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp uge double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}f64_ult:
 ; GCN: v_cmp_nge_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define amdgpu_kernel void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ult(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ult double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}f64_ule:
 ; GCN: v_cmp_ngt_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define amdgpu_kernel void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ule(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ule double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}f64_une:
 ; GCN: v_cmp_neq_f64
-define amdgpu_kernel void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_une(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp une double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}f64_uno:
 ; GCN: v_cmp_u_f64
-define amdgpu_kernel void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_uno(ptr addrspace(1) %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp uno double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -161,102 +161,102 @@ entry:
 ; GCN-LABEL: {{^}}i64_eq:
 ; SI: v_cmp_eq_u64
 ; VI: s_cmp_eq_u64
-define amdgpu_kernel void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_eq(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp eq i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}i64_ne:
 ; SI: v_cmp_ne_u64
 ; VI: s_cmp_lg_u64
-define amdgpu_kernel void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ne(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ne i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}i64_ugt:
 ; GCN: v_cmp_gt_u64
-define amdgpu_kernel void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ugt(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ugt i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}i64_uge:
 ; GCN: v_cmp_ge_u64
-define amdgpu_kernel void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_uge(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp uge i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}i64_ult:
 ; GCN: v_cmp_lt_u64
-define amdgpu_kernel void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ult(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ult i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}i64_ule:
 ; GCN: v_cmp_le_u64
-define amdgpu_kernel void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ule(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ule i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}i64_sgt:
 ; GCN: v_cmp_gt_i64
-define amdgpu_kernel void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_sgt(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp sgt i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}i64_sge:
 ; GCN: v_cmp_ge_i64
-define amdgpu_kernel void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_sge(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp sge i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}i64_slt:
 ; GCN: v_cmp_lt_i64
-define amdgpu_kernel void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_slt(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp slt i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}i64_sle:
 ; GCN: v_cmp_le_i64
-define amdgpu_kernel void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_sle(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp sle i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -265,22 +265,22 @@ entry:
 ; CGV: v_cndmask
 ; SI: v_cmp_eq_u64
 ; VI: s_cmp_eq_u64
-define amdgpu_kernel void @i128_sle(i32 addrspace(1)* %out, i128 %a, i128 %b) #0 {
+define amdgpu_kernel void @i128_sle(ptr addrspace(1) %out, i128 %a, i128 %b) #0 {
 entry:
   %tmp0 = icmp sle i128 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}i128_eq_const:
 ; SI: v_cmp_eq_u64
 ; VI: s_cmp_eq_u64
-define amdgpu_kernel void @i128_eq_const(i32 addrspace(1)* %out, i128 %a) #0 {
+define amdgpu_kernel void @i128_eq_const(ptr addrspace(1) %out, i128 %a) #0 {
 entry:
   %tmp0 = icmp eq i128 %a, 85070591730234615865843651857942052992
   %tmp1 = sext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
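
For readers skimming the diff, the rewrite applied to each of these tests is purely syntactic: a typed pointer such as i32 addrspace(1)* becomes the opaque ptr addrspace(1), and the pointee type is carried only by the load or store instruction itself. A minimal before/after sketch (hypothetical @example kernel, not part of this commit):

  define amdgpu_kernel void @example(ptr addrspace(1) %out, i32 %v) {
    ; typed-pointer form:  store i32 %v, i32 addrspace(1)* %out
    ; opaque-pointer form, as used throughout this patch:
    store i32 %v, ptr addrspace(1) %out
    ret void
  }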
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
index 995837e4fa203..8971f77a70bca 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
 
-define amdgpu_kernel void @sext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+define amdgpu_kernel void @sext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a, i32 %b) {
 ; GCN-LABEL: sext_i16_to_i32_uniform:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -17,12 +17,12 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %
 ; GCN-NEXT:    s_endpgm
   %sext = sext i16 %a to i32
   %res = add i32 %b, %sext
-  store i32 %res, i32 addrspace(1)* %out
+  store i32 %res, ptr addrspace(1) %out
   ret void
 }
 
 
-define amdgpu_kernel void @sext_i16_to_i64_uniform(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+define amdgpu_kernel void @sext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) {
 ; GCN-LABEL: sext_i16_to_i64_uniform:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -40,11 +40,11 @@ define amdgpu_kernel void @sext_i16_to_i64_uniform(i64 addrspace(1)* %out, i16 %
 ; GCN-NEXT:    s_endpgm
   %sext = sext i16 %a to i64
   %res = add i64 %b, %sext
-  store i64 %res, i64 addrspace(1)* %out
+  store i64 %res, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sext_i16_to_i32_divergent(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+define amdgpu_kernel void @sext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 %a, i32 %b) {
 ; GCN-LABEL: sext_i16_to_i32_divergent:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -60,12 +60,12 @@ define amdgpu_kernel void @sext_i16_to_i32_divergent(i32 addrspace(1)* %out, i16
   %tid.truncated = trunc i32 %tid to i16
   %divergent.a = add i16 %a, %tid.truncated
   %sext = sext i16 %divergent.a to i32
-  store i32 %sext, i32 addrspace(1)* %out
+  store i32 %sext, ptr addrspace(1) %out
   ret void
 }
 
 
-define amdgpu_kernel void @sext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+define amdgpu_kernel void @sext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 %a, i64 %b) {
 ; GCN-LABEL: sext_i16_to_i64_divergent:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -82,11 +82,11 @@ define amdgpu_kernel void @sext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16
   %tid.truncated = trunc i32 %tid to i16
   %divergent.a = add i16 %a, %tid.truncated
   %sext = sext i16 %divergent.a to i64
-  store i64 %sext, i64 addrspace(1)* %out
+  store i64 %sext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sext_i32_to_i64_uniform(i64 addrspace(1)* %out, i32 %a, i64 %b) {
+define amdgpu_kernel void @sext_i32_to_i64_uniform(ptr addrspace(1) %out, i32 %a, i64 %b) {
 ; GCN-LABEL: sext_i32_to_i64_uniform:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s6, s[0:1], 0xb
@@ -104,11 +104,11 @@ define amdgpu_kernel void @sext_i32_to_i64_uniform(i64 addrspace(1)* %out, i32 %
 ; GCN-NEXT:    s_endpgm
   %sext = sext i32 %a to i64
   %res = add i64 %b, %sext
-  store i64 %res, i64 addrspace(1)* %out
+  store i64 %res, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sext_i32_to_i64_divergent(i64 addrspace(1)* %out, i32 %a, i64 %b) {
+define amdgpu_kernel void @sext_i32_to_i64_divergent(ptr addrspace(1) %out, i32 %a, i64 %b) {
 ; GCN-LABEL: sext_i32_to_i64_divergent:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -123,7 +123,7 @@ define amdgpu_kernel void @sext_i32_to_i64_divergent(i64 addrspace(1)* %out, i32
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %divergent.a = add i32 %a, %tid
   %sext = sext i32 %divergent.a to i64
-  store i64 %sext, i64 addrspace(1)* %out
+  store i64 %sext, ptr addrspace(1) %out
   ret void
 }
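
The sext-divergence-driven-isel tests above rely on a common trick for exercising divergence-driven selection: a kernel argument is uniform (every lane sees the same value), so operations on it can be selected to scalar s_* instructions, while mixing in the per-lane workitem id makes the value divergent and forces vector v_* selection. A minimal sketch of the pattern, with a hypothetical kernel name:

  declare i32 @llvm.amdgcn.workitem.id.x()

  define amdgpu_kernel void @make_divergent(ptr addrspace(1) %out, i32 %a) {
    %tid = call i32 @llvm.amdgcn.workitem.id.x() ; per-lane value, hence divergent
    %d = add i32 %a, %tid                        ; %d inherits the divergence
    store i32 %d, ptr addrspace(1) %out
    ret void
  }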
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index fbe91c12155b7..251c5b80d0b19 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -8,7 +8,7 @@
 ; threads will execute the same code paths, so we don't need to worry
 ; about instructions in different blocks overwriting each other.
 
-define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_salu_br(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 ; SI-LABEL: sgpr_if_else_salu_br:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
@@ -49,11 +49,11 @@ else:
 endif:
   %3 = phi i32 [%1, %if], [%2, %else]
   %4 = add i32 %3, %a
-  store i32 %4, i32 addrspace(1)* %out
+  store i32 %4, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_salu_br_opt(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) {
 ; SI-LABEL: sgpr_if_else_salu_br_opt:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0x13
@@ -99,13 +99,13 @@ else:
 endif:
   %phi = phi i32 [%add0, %if], [%add1, %else]
   %add2 = add i32 %phi, %a
-  store i32 %add2, i32 addrspace(1)* %out
+  store i32 %add2, ptr addrspace(1) %out
   ret void
 }
 
 ; The two S_ADD instructions should write to different registers, since
 ; different threads will take different control flow paths.
-define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_valu_br(ptr addrspace(1) %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 ; SI-LABEL: sgpr_if_else_valu_br:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xc
@@ -151,11 +151,11 @@ else:
 
 endif:
   %tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else]
-  store i32 %tmp4, i32 addrspace(1)* %out
+  store i32 %tmp4, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
 ; SI-LABEL: sgpr_if_else_valu_cmp_phi_br:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -204,21 +204,21 @@ entry:
   br i1 %tmp1, label %if, label %else
 
 if:
-  %gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid
-  %a.val = load i32, i32 addrspace(1)* %gep.if
+  %gep.if = getelementptr i32, ptr addrspace(1) %a, i32 %tid
+  %a.val = load i32, ptr addrspace(1) %gep.if
   %cmp.if = icmp eq i32 %a.val, 0
   br label %endif
 
 else:
-  %gep.else = getelementptr i32, i32 addrspace(1)* %b, i32 %tid
-  %b.val = load i32, i32 addrspace(1)* %gep.else
+  %gep.else = getelementptr i32, ptr addrspace(1) %b, i32 %tid
+  %b.val = load i32, ptr addrspace(1) %gep.else
   %cmp.else = icmp slt i32 %b.val, 0
   br label %endif
 
 endif:
   %tmp4 = phi i1 [%cmp.if, %if], [%cmp.else, %else]
   %ext = sext i1 %tmp4 to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
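
The comments at the top of sgpr-control-flow.ll describe the two situations these tests cover: a branch on a uniform condition is taken identically by every lane, so it can remain a scalar branch and both arms may reuse the same SGPRs, whereas a branch on a divergent condition means lanes disagree, both arms execute under an exec mask, and their results must land in distinct registers and be merged afterwards. A minimal sketch of a divergent branch (hypothetical kernel, not from the test file):

  declare i32 @llvm.amdgcn.workitem.id.x()

  define amdgpu_kernel void @divergent_branch(ptr addrspace(1) %out, i32 %a, i32 %b) {
  entry:
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
    %cond = icmp eq i32 %tid, 0          ; differs per lane => divergent branch
    br i1 %cond, label %if, label %else

  if:
    %x = add i32 %a, 1
    br label %endif

  else:
    %y = add i32 %b, 2
    br label %endif

  endif:
    %r = phi i32 [ %x, %if ], [ %y, %else ]
    store i32 %r, ptr addrspace(1) %out
    ret void
  }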
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
index ed11c2c3e22f5..9a75091b83899 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
@@ -6,14 +6,14 @@
 
 ; SI-LABEL: {{^}}test_dup_operands:
 ; SI: v_add_{{[iu]}}32_e32
-define amdgpu_kernel void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
-  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+define amdgpu_kernel void @test_dup_operands(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
+  %a = load <2 x i32>, ptr addrspace(1) %in
   %lo = extractelement <2 x i32> %a, i32 0
   %hi = extractelement <2 x i32> %a, i32 1
   %add = add i32 %lo, %lo
   %vec0 = insertelement <2 x i32> undef, i32 %add, i32 0
   %vec1 = insertelement <2 x i32> %vec0, i32 %hi, i32 1
-  store <2 x i32> %vec1, <2 x i32> addrspace(1)* %out, align 8
+  store <2 x i32> %vec1, ptr addrspace(1) %out, align 8
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll
index 4e796f3384e4b..6635d5f4d6bde 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll
@@ -8,19 +8,19 @@ target triple = "amdgcn-amd-amdhsa"
 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
 ; There should be no redundant copies from PTR_HI.
 ; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
-define protected amdgpu_kernel void @t0(float addrspace(1)* %p, i32 %i0, i32 %j0, i32 %k0) {
+define protected amdgpu_kernel void @t0(ptr addrspace(1) %p, i32 %i0, i32 %j0, i32 %k0) {
 entry:
   %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i = add i32 %0, %i0
   %j = add i32 %0, %j0
   %k = add i32 %0, %k0
-  %pi = getelementptr float, float addrspace(1)* %p, i32 %i
-  %vi = load float, float addrspace(1)* %pi
-  %pj = getelementptr float, float addrspace(1)* %p, i32 %j
-  %vj = load float, float addrspace(1)* %pj
+  %pi = getelementptr float, ptr addrspace(1) %p, i32 %i
+  %vi = load float, ptr addrspace(1) %pi
+  %pj = getelementptr float, ptr addrspace(1) %p, i32 %j
+  %vj = load float, ptr addrspace(1) %pj
   %sum = fadd float %vi, %vj
-  %pk = getelementptr float, float addrspace(1)* %p, i32 %k
-  store float %sum, float addrspace(1)* %pk
+  %pk = getelementptr float, ptr addrspace(1) %p, i32 %k
+  store float %sum, ptr addrspace(1) %pk
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
index b45ed2dce773d..e49ee6d2b6bea 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -5,10 +5,9 @@
 ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
 ; CHECK: ; %bb.1: ; %ELSE
 ; CHECK: s_xor_b32 s{{[0-9]}}, [[DST]]
-define amdgpu_ps void @phi1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <8 x i32> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @phi1(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
+  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg, !tbaa !0
   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 0)
   %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0)
   %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 32, i32 0)
@@ -29,10 +28,9 @@ ENDIF:                                            ; preds = %ELSE, %main_body
 
 ; Make sure this program doesn't crash
 ; CHECK-LABEL: {{^}}phi2:
-define amdgpu_ps void @phi2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <8 x i32> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 {
+define amdgpu_ps void @phi2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
+  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg, !tbaa !0
   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0)
   %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 32, i32 0)
   %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 36, i32 0)
@@ -48,10 +46,8 @@ main_body:
   %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 84, i32 0)
   %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 88, i32 0)
   %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 92, i32 0)
-  %tmp36 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %arg2, i32 0
-  %tmp37 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp36, !tbaa !0
-  %tmp38 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg1, i32 0
-  %tmp39 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp38, !tbaa !0
+  %tmp37 = load <8 x i32>, ptr addrspace(4) %arg2, !tbaa !0
+  %tmp39 = load <4 x i32>, ptr addrspace(4) %arg1, !tbaa !0
   %i.i = extractelement <2 x i32> %arg5, i32 0
   %j.i = extractelement <2 x i32> %arg5, i32 1
   %i.f.i = bitcast i32 %i.i to float
@@ -169,10 +165,9 @@ ENDIF24:                                          ; preds = %IF25, %ENDIF
 
 ; We just want to make sure the program doesn't crash
 ; CHECK-LABEL: {{^}}loop:
-define amdgpu_ps void @loop(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <8 x i32> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @loop(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
+  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg, !tbaa !0
   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 0)
   %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 4, i32 0)
   %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 8, i32 0)
@@ -222,15 +217,12 @@ ENDIF:                                            ; preds = %LOOP
 ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v[[[SAMPLE_LO]]:[[SAMPLE_HI]]]
 ; CHECK: exp
 ; CHECK: s_endpgm
-define amdgpu_ps void @sample_v3([17 x <4 x i32>] addrspace(4)* inreg %arg, [32 x <4 x i32>] addrspace(4)* inreg %arg1, [16 x <8 x i32>] addrspace(4)* inreg %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
+define amdgpu_ps void @sample_v3(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 entry:
-  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
-  %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
+  %tmp21 = load <4 x i32>, ptr addrspace(4) %arg, !tbaa !0
   %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 16, i32 0)
-  %tmp23 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0
-  %tmp24 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp23, !tbaa !0
-  %tmp25 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0
-  %tmp26 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp25, !tbaa !0
+  %tmp24 = load <8 x i32>, ptr addrspace(4) %arg2, !tbaa !0
+  %tmp26 = load <4 x i32>, ptr addrspace(4) %arg1, !tbaa !0
   %tmp27 = fcmp oeq float %tmp22, 0.000000e+00
   %tmp26.bc = bitcast <4 x i32> %tmp26 to <4 x i32>
   br i1 %tmp27, label %if, label %else
@@ -261,9 +253,9 @@ endif:                                            ; preds = %else, %if
 ; CHECK: buffer_load_dword
 ; CHECK: v_add
 ; CHECK: s_endpgm
-define amdgpu_kernel void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
+define amdgpu_kernel void @copy1(ptr addrspace(1) %out, ptr addrspace(1) %in0) {
 entry:
-  %tmp = load float, float addrspace(1)* %in0
+  %tmp = load float, ptr addrspace(1) %in0
   %tmp1 = fcmp oeq float %tmp, 0.000000e+00
   br i1 %tmp1, label %if0, label %endif
 
@@ -279,14 +271,14 @@ if1:                                              ; preds = %if0
 endif:                                            ; preds = %if1, %if0, %entry
   %tmp5 = phi i32 [ 0, %entry ], [ %tmp2, %if0 ], [ %tmp4, %if1 ]
   %tmp6 = bitcast i32 %tmp5 to float
-  store float %tmp6, float addrspace(1)* %out
+  store float %tmp6, ptr addrspace(1) %out
   ret void
 }
 
 ; This test is just checking that we don't crash / assertion fail.
 ; CHECK-LABEL: {{^}}copy2:
 ; CHECK: s_endpgm
-define amdgpu_ps void @copy2([17 x <4 x i32>] addrspace(4)* inreg %arg, [32 x <4 x i32>] addrspace(4)* inreg %arg1, [16 x <8 x i32>] addrspace(4)* inreg %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
+define amdgpu_ps void @copy2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 entry:
   br label %LOOP68
 
@@ -324,15 +316,12 @@ ENDIF69:                                          ; preds = %LOOP68
 ; CHECK: v_add_{{[iu]}}32_e32 v[[ADD:[0-9]+]], vcc, 1, v{{[0-9]+}}
 ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]]
 ; CHECK: s_branch
-define amdgpu_ps void @sample_rsrc([6 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [16 x <4 x i32>] addrspace(4)* inreg %arg2, [32 x <8 x i32>] addrspace(4)* inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
+define amdgpu_ps void @sample_rsrc(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 bb:
-  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg1, i32 0, i32 0
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !3
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %arg1, !tbaa !3
   %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp22, i32 16, i32 0)
-  %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(4)* %arg3, i32 0, i32 0
-  %tmp26 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp25, !tbaa !3
-  %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(4)* %arg2, i32 0, i32 0
-  %tmp28 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp27, !tbaa !3
+  %tmp26 = load <8 x i32>, ptr addrspace(4) %arg3, !tbaa !3
+  %tmp28 = load <4 x i32>, ptr addrspace(4) %arg2, !tbaa !3
   %i.i = extractelement <2 x i32> %arg7, i32 0
   %j.i = extractelement <2 x i32> %arg7, i32 1
   %i.f.i = bitcast i32 %i.i to float
@@ -374,11 +363,11 @@ bb71:                                             ; preds = %bb80, %bb38
 ; Check the resource descriptor is stored in an sgpr.
 ; CHECK-LABEL: {{^}}mimg_srsrc_sgpr:
 ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
-define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(4)* inreg %arg) #0 {
+define amdgpu_ps void @mimg_srsrc_sgpr(ptr addrspace(4) inreg %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
-  %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(4)* %arg, i32 0, i32 %tid
-  %tmp8 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp7, align 32, !tbaa !0
+  %tmp7 = getelementptr [34 x <8 x i32>], ptr addrspace(4) %arg, i32 0, i32 %tid
+  %tmp8 = load <8 x i32>, ptr addrspace(4) %tmp7, align 32, !tbaa !0
   %tmp = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 7.500000e-01, float 2.500000e-01, <8 x i32> %tmp8, <4 x i32> undef, i1 0, i32 0, i32 0)
   %tmp10 = extractelement <4 x float> %tmp, i32 0
   %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp10)
@@ -389,11 +378,11 @@ bb:
 ; Check the sampler is stored in an sgpr.
 ; CHECK-LABEL: {{^}}mimg_ssamp_sgpr:
 ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
-define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(4)* inreg %arg) #0 {
+define amdgpu_ps void @mimg_ssamp_sgpr(ptr addrspace(4) inreg %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
-  %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i32 0, i32 %tid
-  %tmp8 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp7, align 16, !tbaa !0
+  %tmp7 = getelementptr [17 x <4 x i32>], ptr addrspace(4) %arg, i32 0, i32 %tid
+  %tmp8 = load <4 x i32>, ptr addrspace(4) %tmp7, align 16, !tbaa !0
   %tmp = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 7.500000e-01, float 2.500000e-01, <8 x i32> undef, <4 x i32> %tmp8, i1 0, i32 0, i32 0)
   %tmp10 = extractelement <4 x float> %tmp, i32 0
   %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef)
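
One non-mechanical detail visible in the sgpr-copy.ll changes above: a getelementptr whose indices are all zero yields the same address as its base pointer, so once the pointee type no longer matters these GEPs carry no information and the loads read the argument directly. A minimal sketch of the equivalence (hypothetical function, not from this commit):

  define <4 x i32> @load_descriptor(ptr addrspace(4) %arg) {
    ; 'getelementptr [17 x <4 x i32>], ptr addrspace(4) %arg, i64 0, i32 0' would
    ; simply return %arg, so the zero-index GEP can be dropped entirely.
    %desc = load <4 x i32>, ptr addrspace(4) %arg, align 16
    ret <4 x i32> %desc
  }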

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll
index 02f90ba227704..29622d3fd0f1b 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-incorrect-fi-bookkeeping-bug.ll
@@ -4,7 +4,7 @@
 ; which was due to incorrect book-keeping of removed dead frame indices.
 
 ; CHECK-LABEL: {{^}}kernel0:
-define amdgpu_kernel void @kernel0(i32 addrspace(1)* %out, i32 %in) #1 {
+define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
   call void asm sideeffect "", "~{v[0:7]}" () #0
   call void asm sideeffect "", "~{v[8:15]}" () #0
   call void asm sideeffect "", "~{v[16:19]}"() #0

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index 7741bd427e6de..804779d5a63f8 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -6,7 +6,7 @@
 ; When we started spilling them into virtual VGPR lanes, we always succeed in doing so.
 ; The regalloc pass later takes care of allocating VGPRs to these virtual registers.
 
-define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 {
+define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 {
 ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_add_u32 s0, s0, s7

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index ad96f56f2d789..03a538e975bef 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -278,7 +278,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
+  store volatile i32 0, ptr addrspace(5) %alloca
 
   call void asm sideeffect "",
   "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
@@ -575,7 +575,7 @@ define void @spill_to_lowest_available_vgpr() #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
+  store volatile i32 0, ptr addrspace(5) %alloca
 
   call void asm sideeffect "",
   "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
@@ -865,7 +865,7 @@ define void @spill_sgpr_with_sgpr_uses() #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
+  store volatile i32 0, ptr addrspace(5) %alloca
 
   call void asm sideeffect "",
   "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
@@ -1145,7 +1145,7 @@ define void @spill_sgpr_with_tail_call() #0 {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
+  store volatile i32 0, ptr addrspace(5) %alloca
 
   call void asm sideeffect "",
   "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
@@ -1178,7 +1178,7 @@ define void @spill_sgpr_with_tail_call() #0 {
   ret void
 }
 
-define void @spill_sgpr_no_free_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-LABEL: spill_sgpr_no_free_vgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1464,7 +1464,7 @@ define void @spill_sgpr_no_free_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> add
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %a = load <4 x i32>, ptr addrspace(1) %in
   call void asm sideeffect "",
   "~{v6},~{v7},~{v8},~{v9}
   ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
@@ -1496,7 +1496,7 @@ define void @spill_sgpr_no_free_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> add
   call void asm sideeffect "",
   "~{s34},~{s35},~{s36},~{s37}" () #0
 
-  store <4 x i32> %a, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %a, ptr addrspace(1) %out
   ret void
 }
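
These SGPR-spill tests create register pressure with empty inline asm: the asm body emits nothing, but its constraint string clobbers whole ranges of registers, so no values may live in them across the call and the allocator is left with few or no free VGPRs for the spill. A minimal sketch of the idiom, assuming a deliberately short clobber list:

  define void @clobber_some_regs() {
    %alloca = alloca i32, align 4, addrspace(5)
    store volatile i32 0, ptr addrspace(5) %alloca
    ; Nothing is emitted for the asm itself; the clobbers alone make v0-v3 and
    ; s34-s37 unavailable to hold live values across this point.
    call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{s34},~{s35},~{s36},~{s37}"()
    ret void
  }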
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sgprcopies.ll b/llvm/test/CodeGen/AMDGPU/sgprcopies.ll
index 68cd83bb6cf09..96773bee1cad7 100644
--- a/llvm/test/CodeGen/AMDGPU/sgprcopies.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgprcopies.ll
@@ -3,7 +3,7 @@
 ; GCN-LABEL: {{^}}checkTwoBlocksWithUniformBranch
 ; GCN: BB0_2
 ; GCN: v_add
-define amdgpu_kernel void @checkTwoBlocksWithUniformBranch(i32 addrspace(1)* nocapture %out, i32 %width, float %xPos, float %yPos, float %xStep, float %yStep, i32 %maxIter) {
+define amdgpu_kernel void @checkTwoBlocksWithUniformBranch(ptr addrspace(1) nocapture %out, i32 %width, float %xPos, float %yPos, float %xStep, float %yStep, i32 %maxIter) {
 entry:
   %conv = call i32 @llvm.amdgcn.workitem.id.x() #1
   %rem = urem i32 %conv, %width
@@ -45,8 +45,8 @@ for.end.loopexit:                                 ; preds = %for.body
 for.end:                                          ; preds = %for.end.loopexit, %entry
   %iter.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %for.end.loopexit ]
   %idxprom = ashr exact i32 %conv, 32
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %idxprom
-  store i32 %iter.0.lcssa, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %idxprom
+  store i32 %iter.0.lcssa, ptr addrspace(1) %arrayidx, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/shift-select.ll b/llvm/test/CodeGen/AMDGPU/shift-select.ll
index 3c6a07ab05d08..87adc2ba6b4a4 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-select.ll
@@ -5,130 +5,130 @@
 
 ; GCN-LABEL: name:            s_shl_i32
 ; GCN: S_LSHL_B32
-define amdgpu_kernel void @s_shl_i32(i32 addrspace(1)* %out, i32 %lhs, i32 %rhs) {
+define amdgpu_kernel void @s_shl_i32(ptr addrspace(1) %out, i32 %lhs, i32 %rhs) {
   %result = shl i32 %lhs, %rhs
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: name:            v_shl_i32
 ; GFX6: V_LSHL_B32_e32
 ; GFX8PLUS: V_LSHLREV_B32_e32
-define amdgpu_kernel void @v_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @v_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %a = load i32, i32 addrspace(1)* %in
-  %b = load i32, i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
   %result = shl i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: name:            s_lshr_i32
 ; GCN: S_LSHR_B32
-define amdgpu_kernel void @s_lshr_i32(i32 addrspace(1)* %out, i32 %lhs, i32 %rhs) {
+define amdgpu_kernel void @s_lshr_i32(ptr addrspace(1) %out, i32 %lhs, i32 %rhs) {
   %result = lshr i32 %lhs, %rhs
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: name:            v_lshr_i32
 ; GFX6: V_LSHR_B32_e32
 ; GFX8PLUS: V_LSHRREV_B32_e64
-define amdgpu_kernel void @v_lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @v_lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %a = load i32, i32 addrspace(1)* %in
-  %b = load i32, i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
   %result = lshr i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: name:            s_ashr_i32
 ; GCN: S_ASHR_I32
-define amdgpu_kernel void @s_ashr_i32(i32 addrspace(1)* %out, i32 %lhs, i32 %rhs) #0 {
+define amdgpu_kernel void @s_ashr_i32(ptr addrspace(1) %out, i32 %lhs, i32 %rhs) #0 {
   %result = ashr i32 %lhs, %rhs
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: name:            v_ashr_i32
 ; GFX6: V_ASHR_I32_e32
 ; GFX8PLUS: V_ASHRREV_I32_e64
-define amdgpu_kernel void @v_ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @v_ashr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %a = load i32, i32 addrspace(1)* %in
-  %b = load i32, i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
   %result = ashr i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: name:            s_shl_i64
 ; GCN: S_LSHL_B64
-define amdgpu_kernel void @s_shl_i64(i64 addrspace(1)* %out, i64 %lhs, i64 %rhs) {
+define amdgpu_kernel void @s_shl_i64(ptr addrspace(1) %out, i64 %lhs, i64 %rhs) {
   %result = shl i64 %lhs, %rhs
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: name:            v_shl_i64
 ; GFX6: V_LSHL_B64
 ; GFX8: V_LSHLREV_B64
-define amdgpu_kernel void @v_shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idx = zext i32 %tid to i64
-  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %idx
-  %a = load i64, i64 addrspace(1)* %in
-  %b = load i64, i64 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 %idx
+  %a = load i64, ptr addrspace(1) %in
+  %b = load i64, ptr addrspace(1) %b_ptr
   %result = shl i64 %a, %b
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: name:            s_lshr_i64
 ; GCN: S_LSHR_B64
-define amdgpu_kernel void @s_lshr_i64(i64 addrspace(1)* %out, i64 %lhs, i64 %rhs) {
+define amdgpu_kernel void @s_lshr_i64(ptr addrspace(1) %out, i64 %lhs, i64 %rhs) {
   %result = lshr i64 %lhs, %rhs
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: name:            v_lshr_i64
 ; GFX6: V_LSHR_B64
 ; GFX8: V_LSHRREV_B64
-define amdgpu_kernel void @v_lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idx = zext i32 %tid to i64
-  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %idx
-  %a = load i64, i64 addrspace(1)* %in
-  %b = load i64, i64 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 %idx
+  %a = load i64, ptr addrspace(1) %in
+  %b = load i64, ptr addrspace(1) %b_ptr
   %result = lshr i64 %a, %b
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: name:            s_ashr_i64
 ; GCN: S_ASHR_I64
-define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i64 %lhs, i64 %rhs) {
+define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i64 %lhs, i64 %rhs) {
   %result = ashr i64 %lhs, %rhs
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: name:            v_ashr_i64
 ; GFX6: V_ASHR_I64
 ; GFX8: V_ASHRREV_I64
-define amdgpu_kernel void @v_ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_ashr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idx = zext i32 %tid to i64
-  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %idx
-  %a = load i64, i64 addrspace(1)* %in
-  %b = load i64, i64 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 %idx
+  %a = load i64, ptr addrspace(1) %in
+  %b = load i64, ptr addrspace(1) %b_ptr
   %result = ashr i64 %a, %b
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
index 9931b9a3b51c3..dadffca71b3e9 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
@@ -105,8 +105,8 @@ endif1:
   br i1 %.0, label %if2, label %endif2
 
 if2:
-  %.5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(6)* undef, i32 31, !amdgpu.uniform !0
-  %.6 = load <4 x i32>, <4 x i32> addrspace(6)* %.5, align 16, !invariant.load !0
+  %.5 = getelementptr inbounds <4 x i32>, ptr addrspace(6) undef, i32 31, !amdgpu.uniform !0
+  %.6 = load <4 x i32>, ptr addrspace(6) %.5, align 16, !invariant.load !0
   %.7 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %.test1, <4 x i32> %.6, i32 4, i32 0, i32 0)
   %.8 = sitofp i32 %.7 to float
   br label %endif2

diff  --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
index dae077944c6f5..9fafd12fc5d84 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
@@ -9,15 +9,15 @@
 ; GCN: s_cbranch_scc1
 ; GCN-NOT: s_endpgm
 ; GCN: .Lfunc_end0
-define amdgpu_kernel void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+define amdgpu_kernel void @annotate_unreachable_noloop(ptr addrspace(1) noalias nocapture readonly %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
 
 bb1:                                              ; preds = %bb
   %tmp2 = sext i32 %tmp to i64
-  %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
-  %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
+  %tmp3 = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i64 %tmp2
+  %tmp4 = load <4 x float>, ptr addrspace(1) %tmp3, align 16
   br i1 undef, label %bb5, label %bb3
 
 bb3:                                              ; preds = %bb1
@@ -42,17 +42,17 @@ bb5:                                              ; preds = %bb3, %bb1
 ; GCN: s_and_saveexec_b64
 ; GCN-NEXT: s_endpgm
 ; GCN: .Lfunc_end
-define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+define amdgpu_kernel void @annotate_ret_noloop(ptr addrspace(1) noalias nocapture readonly %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
 
 bb1:                                              ; preds = %bb
   %tmp2 = sext i32 %tmp to i64
-  %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
-  %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
+  %tmp3 = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i64 %tmp2
+  %tmp4 = load <4 x float>, ptr addrspace(1) %tmp3, align 16
   %tmp5 = extractelement <4 x float> %tmp4, i32 1
-  store volatile <4 x float> %tmp4, <4 x float> addrspace(1)* undef
+  store volatile <4 x float> %tmp4, ptr addrspace(1) undef
   %cmp = fcmp ogt float %tmp5, 1.0
   br i1 %cmp, label %bb5, label %bb3
 
@@ -75,14 +75,14 @@ bb5:                                              ; preds = %bb3, %bb1
 ; GCN: s_cbranch_scc1
 ; GCN: s_endpgm
 ; GCN: .Lfunc_end
-define amdgpu_kernel void @uniform_annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg, i32 %tmp) #0 {
+define amdgpu_kernel void @uniform_annotate_ret_noloop(ptr addrspace(1) noalias nocapture readonly %arg, i32 %tmp) #0 {
 bb:
   br label %bb1
 
 bb1:                                              ; preds = %bb
   %tmp2 = sext i32 %tmp to i64
-  %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
-  %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
+  %tmp3 = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i64 %tmp2
+  %tmp4 = load <4 x float>, ptr addrspace(1) %tmp3, align 16
   br i1 undef, label %bb5, label %bb3
 
 bb3:                                              ; preds = %bb1

diff  --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
index 9c385191941dd..f8fde0ab5892d 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
@@ -11,15 +11,15 @@
 ; GCN: s_and_saveexec_b64
 ; GCN-NOT: s_endpgm
 ; GCN: .Lfunc_end0
-define amdgpu_kernel void @annotate_unreachable(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+define amdgpu_kernel void @annotate_unreachable(ptr addrspace(1) noalias nocapture readonly %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
 
 bb1:                                              ; preds = %bb
   %tmp2 = sext i32 %tmp to i64
-  %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
-  %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
+  %tmp3 = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i64 %tmp2
+  %tmp4 = load <4 x float>, ptr addrspace(1) %tmp3, align 16
   br i1 undef, label %bb3, label %bb5  ; label order reversed
 
 bb3:                                              ; preds = %bb1

diff  --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index fb070e8304919..85ce877fcb0ca 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=FLAT %s
 
-define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out, i32 %a) {
 ; SI-LABEL: break_inserted_outside_of_loop:
 ; SI:       ; %bb.0: ; %main_body
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -59,7 +59,7 @@ main_body:
   br label %ENDIF
 
 ENDLOOP:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   ret void
 
 ENDIF:
@@ -139,7 +139,7 @@ exit:
   ret void
 }
 
-define amdgpu_kernel void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+define amdgpu_kernel void @switch_unreachable(ptr addrspace(1) %g, ptr addrspace(3) %l, i32 %x) nounwind {
 ; SI-LABEL: switch_unreachable:
 ; SI:       ; %bb.0: ; %centry
 ;
@@ -314,7 +314,7 @@ entry:
   br label %while.cond.outer
 
 while.cond.outer:
-  %tmp = load float, float addrspace(1)* undef
+  %tmp = load float, ptr addrspace(1) undef
   br label %while.cond
 
 while.cond:
@@ -331,7 +331,7 @@ if.end:
   br i1 %cmp2, label %if.else, label %while.cond.outer
 
 if.else:
-  store volatile i32 3, i32 addrspace(1)* undef, align 4
+  store volatile i32 3, ptr addrspace(1) undef, align 4
   br label %while.cond
 
 for.cond:

diff  --git a/llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll b/llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll
index 0fc1f4ba81331..0edd9f4cd6b4f 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotatecfg-multiple-backedges.ll
@@ -5,7 +5,7 @@
 ; reset the variable introduced to record and accumulate the number of threads
 ; which have already exited the loop.
 
-define amdgpu_kernel void @multiple_backedges(i32 %arg, i32* %arg1) {
+define amdgpu_kernel void @multiple_backedges(i32 %arg, ptr %arg1) {
 ; OPT-LABEL: @multiple_backedges(
 ; OPT-NEXT:  entry:
 ; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -29,9 +29,9 @@ define amdgpu_kernel void @multiple_backedges(i32 %arg, i32* %arg1) {
 ; OPT:       loop_exit:
 ; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
 ; OPT-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP]] to i64
-; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[ARG1:%.*]], i64 [[TMP12]]
-; OPT-NEXT:    [[TMP14:%.*]] = addrspacecast i32* [[TMP13]] to i32 addrspace(1)*
-; OPT-NEXT:    store i32 [[TMP5]], i32 addrspace(1)* [[TMP14]], align 4
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG1:%.*]], i64 [[TMP12]]
+; OPT-NEXT:    [[TMP14:%.*]] = addrspacecast ptr [[TMP13]] to ptr addrspace(1)
+; OPT-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP14]], align 4
 ; OPT-NEXT:    ret void
 ;
 entry:
@@ -51,9 +51,9 @@ loop_end:
 
 loop_exit:
   %tmp12 = zext i32 %tmp to i64
-  %tmp13 = getelementptr inbounds i32, i32* %arg1, i64 %tmp12
-  %tmp14 = addrspacecast i32* %tmp13 to i32 addrspace(1)*
-  store i32 %tmp5, i32 addrspace(1)* %tmp14, align 4
+  %tmp13 = getelementptr inbounds i32, ptr %arg1, i64 %tmp12
+  %tmp14 = addrspacecast ptr %tmp13 to ptr addrspace(1)
+  store i32 %tmp5, ptr addrspace(1) %tmp14, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
index df8366a722e93..667c3df22d4e7 100644
--- a/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
@@ -5,12 +5,12 @@
 
 ; CHECK: %{{[0-9]+}}:vgpr_32 = V_ADD_CO_U32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def $vcc, implicit $exec
 
-define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load volatile i32, i32 addrspace(1)* %in
-  %b = load volatile i32, i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load volatile i32, ptr addrspace(1) %in
+  %b = load volatile i32, ptr addrspace(1) %b_ptr
   %result = add i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
index 8dac78d584882..969dbc7b30c6a 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -19,7 +19,7 @@ bb:
   br i1 %tmp63, label %unreachable, label %ret
 
 unreachable:
-  store volatile i32 0, i32 addrspace(3)* undef, align 4
+  store volatile i32 0, ptr addrspace(3) undef, align 4
   unreachable
 
 ret:
@@ -47,7 +47,7 @@ ret:
   ret void
 
 unreachable:
-  store volatile i32 0, i32 addrspace(3)* undef, align 4
+  store volatile i32 0, ptr addrspace(3) undef, align 4
   unreachable
 }
 
@@ -66,7 +66,7 @@ bb:
   br i1 %tmp63, label %unreachable, label %ret
 
 unreachable:
-  store volatile i32 0, i32 addrspace(3)* undef, align 4
+  store volatile i32 0, ptr addrspace(3) undef, align 4
   unreachable
 
 ret:

diff  --git a/llvm/test/CodeGen/AMDGPU/si-scheduler.ll b/llvm/test/CodeGen/AMDGPU/si-scheduler.ll
index af31741254b25..05e6fa9e44578 100644
--- a/llvm/test/CodeGen/AMDGPU/si-scheduler.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-scheduler.ll
@@ -17,12 +17,10 @@
 ; CHECK: s_waitcnt vmcnt(0)
 ; CHECK: exp
 ; CHECK: s_endpgm
-define amdgpu_ps void @main([6 x <16 x i8>] addrspace(4)* inreg %arg, [17 x <16 x i8>] addrspace(4)* inreg %arg1, [17 x <4 x i32>] addrspace(4)* inreg %arg2, [34 x <8 x i32>] addrspace(4)* inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
+define amdgpu_ps void @main(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 main_body:
-  %tmp = bitcast [34 x <8 x i32>] addrspace(4)* %arg3 to <32 x i8> addrspace(4)*
-  %tmp22 = load <32 x i8>, <32 x i8> addrspace(4)* %tmp, align 32, !tbaa !0
-  %tmp23 = bitcast [17 x <4 x i32>] addrspace(4)* %arg2 to <16 x i8> addrspace(4)*
-  %tmp24 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp23, align 16, !tbaa !0
+  %tmp22 = load <32 x i8>, ptr addrspace(4) %arg3, align 32, !tbaa !0
+  %tmp24 = load <16 x i8>, ptr addrspace(4) %arg2, align 16, !tbaa !0
   %i.i = extractelement <2 x i32> %arg11, i32 0
   %j.i = extractelement <2 x i32> %arg11, i32 1
   %i.f.i = bitcast i32 %i.i to float
@@ -70,8 +68,8 @@ define amdgpu_ps void @_amdgpu_ps_main(i32 %arg) local_unnamed_addr {
 .entry:
   %tmp = insertelement <2 x i32> zeroinitializer, i32 %arg, i32 0
   %tmp1 = bitcast <2 x i32> %tmp to i64
-  %tmp2 = inttoptr i64 %tmp1 to <4 x i32> addrspace(4)*
-  %tmp3 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp2, align 16
+  %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(4)
+  %tmp3 = load <4 x i32>, ptr addrspace(4) %tmp2, align 16
   %tmp4 = tail call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp3, i32 0, i32 0) #0
   switch i32 %tmp4, label %bb [
     i32 0, label %bb5

diff  --git a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
index 4331aaaa0fb79..0d227ad47ebb0 100644
--- a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -31,11 +31,10 @@
 ; GCN: s_endpgm
 
 ; TOVGPR: ScratchSize: 0{{$}}
-define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* inreg %arg, [32 x <4 x i32>] addrspace(4)* inreg %arg1, [16 x <8 x i32>] addrspace(4)* inreg %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
+define amdgpu_ps void @main(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
 main_body:
-  %lds = inttoptr i32 0 to [64 x i32] addrspace(3)*
-  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
-  %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
+  %lds = inttoptr i32 0 to ptr addrspace(3)
+  %tmp21 = load <4 x i32>, ptr addrspace(4) %arg, !tbaa !0
   %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0)
   %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 100, i32 0)
   %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 104, i32 0)
@@ -74,39 +73,37 @@ main_body:
   %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 372, i32 0)
   %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 376, i32 0)
   %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 384, i32 0)
-  %tmp60 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0
-  %tmp61 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp60, !tbaa !0
-  %tmp62 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0
-  %tmp63 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp62, !tbaa !0
+  %tmp61 = load <8 x i32>, ptr addrspace(4) %arg2, !tbaa !0
+  %tmp63 = load <4 x i32>, ptr addrspace(4) %arg1, !tbaa !0
   %tmp63.bc = bitcast <4 x i32> %tmp63 to <4 x i32>
-  %tmp64 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 1
-  %tmp65 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp64, !tbaa !0
-  %tmp66 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 1
-  %tmp67 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp66, !tbaa !0
-  %tmp68 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 2
-  %tmp69 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp68, !tbaa !0
-  %tmp70 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 2
-  %tmp71 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp70, !tbaa !0
-  %tmp72 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 3
-  %tmp73 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp72, !tbaa !0
-  %tmp74 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 3
-  %tmp75 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp74, !tbaa !0
-  %tmp76 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 4
-  %tmp77 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp76, !tbaa !0
-  %tmp78 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 4
-  %tmp79 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp78, !tbaa !0
-  %tmp80 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 5
-  %tmp81 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp80, !tbaa !0
-  %tmp82 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 5
-  %tmp83 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp82, !tbaa !0
-  %tmp84 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 6
-  %tmp85 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp84, !tbaa !0
-  %tmp86 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 6
-  %tmp87 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp86, !tbaa !0
-  %tmp88 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 7
-  %tmp89 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp88, !tbaa !0
-  %tmp90 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 7
-  %tmp91 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp90, !tbaa !0
+  %tmp64 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 1
+  %tmp65 = load <8 x i32>, ptr addrspace(4) %tmp64, !tbaa !0
+  %tmp66 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 1
+  %tmp67 = load <4 x i32>, ptr addrspace(4) %tmp66, !tbaa !0
+  %tmp68 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 2
+  %tmp69 = load <8 x i32>, ptr addrspace(4) %tmp68, !tbaa !0
+  %tmp70 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 2
+  %tmp71 = load <4 x i32>, ptr addrspace(4) %tmp70, !tbaa !0
+  %tmp72 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 3
+  %tmp73 = load <8 x i32>, ptr addrspace(4) %tmp72, !tbaa !0
+  %tmp74 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 3
+  %tmp75 = load <4 x i32>, ptr addrspace(4) %tmp74, !tbaa !0
+  %tmp76 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 4
+  %tmp77 = load <8 x i32>, ptr addrspace(4) %tmp76, !tbaa !0
+  %tmp78 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 4
+  %tmp79 = load <4 x i32>, ptr addrspace(4) %tmp78, !tbaa !0
+  %tmp80 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 5
+  %tmp81 = load <8 x i32>, ptr addrspace(4) %tmp80, !tbaa !0
+  %tmp82 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 5
+  %tmp83 = load <4 x i32>, ptr addrspace(4) %tmp82, !tbaa !0
+  %tmp84 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 6
+  %tmp85 = load <8 x i32>, ptr addrspace(4) %tmp84, !tbaa !0
+  %tmp86 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 6
+  %tmp87 = load <4 x i32>, ptr addrspace(4) %tmp86, !tbaa !0
+  %tmp88 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 7
+  %tmp89 = load <8 x i32>, ptr addrspace(4) %tmp88, !tbaa !0
+  %tmp90 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 7
+  %tmp91 = load <4 x i32>, ptr addrspace(4) %tmp90, !tbaa !0
   %i.i = extractelement <2 x i32> %arg6, i32 0
   %j.i = extractelement <2 x i32> %arg6, i32 1
   %i.f.i = bitcast i32 %i.i to float
@@ -211,30 +208,30 @@ main_body:
   %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0
   %mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)
-  %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp109
+  %tmp110 = getelementptr [64 x i32], ptr addrspace(3) %lds, i32 0, i32 %tmp109
   %tmp111 = bitcast float %p2.i to i32
-  store i32 %tmp111, i32 addrspace(3)* %tmp110
+  store i32 %tmp111, ptr addrspace(3) %tmp110
   %tmp112 = bitcast float %p2.i96 to i32
-  store i32 %tmp112, i32 addrspace(3)* %tmp110
+  store i32 %tmp112, ptr addrspace(3) %tmp110
   %mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)
-  %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp113
+  %tmp114 = getelementptr [64 x i32], ptr addrspace(3) %lds, i32 0, i32 %tmp113
   %tmp115 = and i32 %tmp113, -4
-  %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp115
+  %tmp116 = getelementptr [64 x i32], ptr addrspace(3) %lds, i32 0, i32 %tmp115
   %tmp117 = add i32 %tmp115, 1
-  %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp117
+  %tmp118 = getelementptr [64 x i32], ptr addrspace(3) %lds, i32 0, i32 %tmp117
   %tmp119 = bitcast float %p2.i to i32
-  store i32 %tmp119, i32 addrspace(3)* %tmp114
-  %tmp120 = load i32, i32 addrspace(3)* %tmp116
+  store i32 %tmp119, ptr addrspace(3) %tmp114
+  %tmp120 = load i32, ptr addrspace(3) %tmp116
   %tmp121 = bitcast i32 %tmp120 to float
-  %tmp122 = load i32, i32 addrspace(3)* %tmp118
+  %tmp122 = load i32, ptr addrspace(3) %tmp118
   %tmp123 = bitcast i32 %tmp122 to float
   %tmp124 = fsub float %tmp123, %tmp121
   %tmp125 = bitcast float %p2.i96 to i32
-  store i32 %tmp125, i32 addrspace(3)* %tmp114
-  %tmp126 = load i32, i32 addrspace(3)* %tmp116
+  store i32 %tmp125, ptr addrspace(3) %tmp114
+  %tmp126 = load i32, ptr addrspace(3) %tmp116
   %tmp127 = bitcast i32 %tmp126 to float
-  %tmp128 = load i32, i32 addrspace(3)* %tmp118
+  %tmp128 = load i32, ptr addrspace(3) %tmp118
   %tmp129 = bitcast i32 %tmp128 to float
   %tmp130 = fsub float %tmp129, %tmp127
   %tmp131 = insertelement <4 x float> undef, float %tmp124, i32 0
@@ -249,48 +246,48 @@ main_body:
   %tmp140 = fmul float %tmp59, %p2.i96
   %mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)
-  %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp141
+  %tmp142 = getelementptr [64 x i32], ptr addrspace(3) %lds, i32 0, i32 %tmp141
   %tmp143 = bitcast float %tmp137 to i32
-  store i32 %tmp143, i32 addrspace(3)* %tmp142
+  store i32 %tmp143, ptr addrspace(3) %tmp142
   %tmp144 = bitcast float %tmp138 to i32
-  store i32 %tmp144, i32 addrspace(3)* %tmp142
+  store i32 %tmp144, ptr addrspace(3) %tmp142
   %tmp145 = bitcast float %tmp139 to i32
-  store i32 %tmp145, i32 addrspace(3)* %tmp142
+  store i32 %tmp145, ptr addrspace(3) %tmp142
   %tmp146 = bitcast float %tmp140 to i32
-  store i32 %tmp146, i32 addrspace(3)* %tmp142
+  store i32 %tmp146, ptr addrspace(3) %tmp142
   %mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3)
-  %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp147
+  %tmp148 = getelementptr [64 x i32], ptr addrspace(3) %lds, i32 0, i32 %tmp147
   %tmp149 = and i32 %tmp147, -4
-  %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp149
+  %tmp150 = getelementptr [64 x i32], ptr addrspace(3) %lds, i32 0, i32 %tmp149
   %tmp151 = add i32 %tmp149, 2
-  %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp151
+  %tmp152 = getelementptr [64 x i32], ptr addrspace(3) %lds, i32 0, i32 %tmp151
   %tmp153 = bitcast float %tmp137 to i32
-  store i32 %tmp153, i32 addrspace(3)* %tmp148
-  %tmp154 = load i32, i32 addrspace(3)* %tmp150
+  store i32 %tmp153, ptr addrspace(3) %tmp148
+  %tmp154 = load i32, ptr addrspace(3) %tmp150
   %tmp155 = bitcast i32 %tmp154 to float
-  %tmp156 = load i32, i32 addrspace(3)* %tmp152
+  %tmp156 = load i32, ptr addrspace(3) %tmp152
   %tmp157 = bitcast i32 %tmp156 to float
   %tmp158 = fsub float %tmp157, %tmp155
   %tmp159 = bitcast float %tmp138 to i32
-  store i32 %tmp159, i32 addrspace(3)* %tmp148
-  %tmp160 = load i32, i32 addrspace(3)* %tmp150
+  store i32 %tmp159, ptr addrspace(3) %tmp148
+  %tmp160 = load i32, ptr addrspace(3) %tmp150
   %tmp161 = bitcast i32 %tmp160 to float
-  %tmp162 = load i32, i32 addrspace(3)* %tmp152
+  %tmp162 = load i32, ptr addrspace(3) %tmp152
   %tmp163 = bitcast i32 %tmp162 to float
   %tmp164 = fsub float %tmp163, %tmp161
   %tmp165 = bitcast float %tmp139 to i32
-  store i32 %tmp165, i32 addrspace(3)* %tmp148
-  %tmp166 = load i32, i32 addrspace(3)* %tmp150
+  store i32 %tmp165, ptr addrspace(3) %tmp148
+  %tmp166 = load i32, ptr addrspace(3) %tmp150
   %tmp167 = bitcast i32 %tmp166 to float
-  %tmp168 = load i32, i32 addrspace(3)* %tmp152
+  %tmp168 = load i32, ptr addrspace(3) %tmp152
   %tmp169 = bitcast i32 %tmp168 to float
   %tmp170 = fsub float %tmp169, %tmp167
   %tmp171 = bitcast float %tmp140 to i32
-  store i32 %tmp171, i32 addrspace(3)* %tmp148
-  %tmp172 = load i32, i32 addrspace(3)* %tmp150
+  store i32 %tmp171, ptr addrspace(3) %tmp148
+  %tmp172 = load i32, ptr addrspace(3) %tmp150
   %tmp173 = bitcast i32 %tmp172 to float
-  %tmp174 = load i32, i32 addrspace(3)* %tmp152
+  %tmp174 = load i32, ptr addrspace(3) %tmp152
   %tmp175 = bitcast i32 %tmp174 to float
   %tmp176 = fsub float %tmp175, %tmp173
   %tmp177 = insertelement <4 x float> undef, float %tmp158, i32 0
@@ -651,10 +648,9 @@ ENDIF66:                                          ; preds = %LOOP65
 ; GCN-LABEL: {{^}}main1:
 ; GCN: s_endpgm
 ; TOVGPR: ScratchSize: 0{{$}}
-define amdgpu_ps void @main1([17 x <4 x i32>] addrspace(4)* inreg %arg, [32 x <4 x i32>] addrspace(4)* inreg %arg1, [16 x <8 x i32>] addrspace(4)* inreg %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
+define amdgpu_ps void @main1(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 main_body:
-  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
-  %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
+  %tmp21 = load <4 x i32>, ptr addrspace(4) %arg, !tbaa !0
   %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 0, i32 0)
   %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 4, i32 0)
   %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 8, i32 0)
@@ -758,42 +754,40 @@ main_body:
   %tmp122 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 716, i32 0)
   %tmp123 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 864, i32 0)
   %tmp124 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 868, i32 0)
-  %tmp125 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0
-  %tmp126 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp125, !tbaa !0
-  %tmp127 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0
-  %tmp128 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp127, !tbaa !0
-  %tmp129 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 1
-  %tmp130 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp129, !tbaa !0
-  %tmp131 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 1
-  %tmp132 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp131, !tbaa !0
-  %tmp133 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 2
-  %tmp134 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp133, !tbaa !0
-  %tmp135 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 2
-  %tmp136 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp135, !tbaa !0
-  %tmp137 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 3
-  %tmp138 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp137, !tbaa !0
-  %tmp139 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 3
-  %tmp140 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp139, !tbaa !0
-  %tmp141 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 4
-  %tmp142 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp141, !tbaa !0
-  %tmp143 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 4
-  %tmp144 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp143, !tbaa !0
-  %tmp145 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 5
-  %tmp146 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp145, !tbaa !0
-  %tmp147 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 5
-  %tmp148 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp147, !tbaa !0
-  %tmp149 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 6
-  %tmp150 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp149, !tbaa !0
-  %tmp151 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 6
-  %tmp152 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp151, !tbaa !0
-  %tmp153 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 7
-  %tmp154 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp153, !tbaa !0
-  %tmp155 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 7
-  %tmp156 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp155, !tbaa !0
-  %tmp157 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 8
-  %tmp158 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp157, !tbaa !0
-  %tmp159 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 8
-  %tmp160 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp159, !tbaa !0
+  %tmp126 = load <8 x i32>, ptr addrspace(4) %arg2, !tbaa !0
+  %tmp128 = load <4 x i32>, ptr addrspace(4) %arg1, !tbaa !0
+  %tmp129 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 1
+  %tmp130 = load <8 x i32>, ptr addrspace(4) %tmp129, !tbaa !0
+  %tmp131 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 1
+  %tmp132 = load <4 x i32>, ptr addrspace(4) %tmp131, !tbaa !0
+  %tmp133 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 2
+  %tmp134 = load <8 x i32>, ptr addrspace(4) %tmp133, !tbaa !0
+  %tmp135 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 2
+  %tmp136 = load <4 x i32>, ptr addrspace(4) %tmp135, !tbaa !0
+  %tmp137 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 3
+  %tmp138 = load <8 x i32>, ptr addrspace(4) %tmp137, !tbaa !0
+  %tmp139 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 3
+  %tmp140 = load <4 x i32>, ptr addrspace(4) %tmp139, !tbaa !0
+  %tmp141 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 4
+  %tmp142 = load <8 x i32>, ptr addrspace(4) %tmp141, !tbaa !0
+  %tmp143 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 4
+  %tmp144 = load <4 x i32>, ptr addrspace(4) %tmp143, !tbaa !0
+  %tmp145 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 5
+  %tmp146 = load <8 x i32>, ptr addrspace(4) %tmp145, !tbaa !0
+  %tmp147 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 5
+  %tmp148 = load <4 x i32>, ptr addrspace(4) %tmp147, !tbaa !0
+  %tmp149 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 6
+  %tmp150 = load <8 x i32>, ptr addrspace(4) %tmp149, !tbaa !0
+  %tmp151 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 6
+  %tmp152 = load <4 x i32>, ptr addrspace(4) %tmp151, !tbaa !0
+  %tmp153 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 7
+  %tmp154 = load <8 x i32>, ptr addrspace(4) %tmp153, !tbaa !0
+  %tmp155 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 7
+  %tmp156 = load <4 x i32>, ptr addrspace(4) %tmp155, !tbaa !0
+  %tmp157 = getelementptr [16 x <8 x i32>], ptr addrspace(4) %arg2, i64 0, i32 8
+  %tmp158 = load <8 x i32>, ptr addrspace(4) %tmp157, !tbaa !0
+  %tmp159 = getelementptr [32 x <4 x i32>], ptr addrspace(4) %arg1, i64 0, i32 8
+  %tmp160 = load <4 x i32>, ptr addrspace(4) %tmp159, !tbaa !0
   %tmp161 = fcmp ugt float %arg17, 0.000000e+00
   %tmp162 = select i1 %tmp161, float 1.000000e+00, float 0.000000e+00
   %i.i = extractelement <2 x i32> %arg6, i32 0

diff  --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
index 6b6798ef9f64f..5eb0ec734cf2e 100644
--- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -19,7 +19,7 @@
 ; SGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0
 
 ; ALL: s_endpgm
-define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
   call void asm sideeffect "", "~{s[0:7]}" ()
   call void asm sideeffect "", "~{s[8:15]}" ()
   call void asm sideeffect "", "~{s[16:23]}" ()
@@ -65,6 +65,6 @@ define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) {
   call void asm sideeffect "", "~{v[240:247]}" ()
   call void asm sideeffect "", "~{v[248:255]}" ()
 
-  store i32 %in, i32 addrspace(1)* %out
+  store i32 %in, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
index d94fed4d12943..f9a17783f0d35 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
@@ -5,7 +5,7 @@
 declare void @llvm.trap()
 declare i32 @llvm.amdgcn.workitem.id.x()
 
-define amdgpu_kernel void @kernel(i32 %a, i32 addrspace(1)* %x, i32 noundef %n) {
+define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) {
 ; This used to bypass the structurization process because structurizer is unable to
 ; handle multiple-exits CFG. This should be correctly structurized.
 ; UNIFY-LABEL: define amdgpu_kernel void @kernel
@@ -29,12 +29,11 @@ define amdgpu_kernel void @kernel(i32 %a, i32 addrspace(1)* %x, i32 noundef %n)
 ; UNIFY-NEXT:    call void @llvm.trap()
 ; UNIFY-NEXT:    br label %UnifiedUnreachableBlock
 ; UNIFY-LABEL: if.end6.sink.split:
-; UNIFY-NEXT:    %x.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %kernel.kernarg.segment, i64 8
-; UNIFY-NEXT:    %x.kernarg.offset.cast = bitcast i8 addrspace(4)* %x.kernarg.offset to i32 addrspace(1)* addrspace(4)*
-; UNIFY-NEXT:    %x.load = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %x.kernarg.offset.cast, align 8, !invariant.load !0
+; UNIFY-NEXT:    %x.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %kernel.kernarg.segment, i64 8
+; UNIFY-NEXT:    %x.load = load ptr addrspace(1), ptr addrspace(4) %x.kernarg.offset, align 8, !invariant.load !0
 ; UNIFY-NEXT:    %idxprom = sext i32 %tid to i64
-; UNIFY-NEXT:    %x1 = getelementptr inbounds i32, i32 addrspace(1)* %x.load, i64 %idxprom
-; UNIFY-NEXT:    store i32 %a.load, i32 addrspace(1)* %x1, align 4
+; UNIFY-NEXT:    %x1 = getelementptr inbounds i32, ptr addrspace(1) %x.load, i64 %idxprom
+; UNIFY-NEXT:    store i32 %a.load, ptr addrspace(1) %x1, align 4
 ; UNIFY-NEXT:    br label %UnifiedReturnBlock
 ; UNIFY-LABEL: UnifiedUnreachableBlock:
 ; UNIFY-NEXT:    call void @llvm.amdgcn.unreachable()
@@ -130,8 +129,8 @@ cond.false.i8:
   unreachable
 
 if.end6.sink.split:
-  %x1 = getelementptr inbounds i32, i32 addrspace(1)* %x, i32 %tid
-  store i32 %a, i32 addrspace(1)* %x1, align 4
+  %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid
+  store i32 %a, ptr addrspace(1) %x1, align 4
   br label %if.end6
 
 if.end6:

diff  --git a/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll b/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll
index fafce8d1c302b..6f768641b5b03 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-branch-trap.ll
@@ -11,10 +11,10 @@
 ; GCN: [[TRAP]]:
 ; GCN: s_trap 2
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @trap_divergent_branch(i32 addrspace(1)* nocapture readonly %arg) {
+define amdgpu_kernel void @trap_divergent_branch(ptr addrspace(1) nocapture readonly %arg) {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
-  %divergent.val = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
+  %divergent.val = load i32, ptr addrspace(1) %gep
   %cmp = icmp eq i32 %divergent.val, 0
   br i1 %cmp, label %bb, label %end
 
@@ -33,10 +33,10 @@ end:
 ; GCN: s_trap 3
 ; GCN-NEXT: [[ENDPGM]]:
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @debugtrap_divergent_branch(i32 addrspace(1)* nocapture readonly %arg) {
+define amdgpu_kernel void @debugtrap_divergent_branch(ptr addrspace(1) nocapture readonly %arg) {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
-  %divergent.val = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
+  %divergent.val = load i32, ptr addrspace(1) %gep
   %cmp = icmp eq i32 %divergent.val, 0
   br i1 %cmp, label %bb, label %end
 

diff  --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 7080c84f7b50a..19cf69f36554b 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -747,13 +747,13 @@ bb:
   %cmp.var = fcmp olt float %var, 0.0
   ; TODO: We could do an early-exit here (the branch above is uniform!)
   call void @llvm.amdgcn.kill(i1 %cmp.var)
-  store volatile float %live.across, float addrspace(1)* undef
+  store volatile float %live.across, ptr addrspace(1) undef
   %live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={v9}"()
   br label %exit
 
 exit:
   %phi = phi float [ 0.0, %entry ], [ %live.out, %bb ]
-  store float %phi, float addrspace(1)* undef
+  store float %phi, ptr addrspace(1) undef
   ret void
 }
 
@@ -1127,12 +1127,12 @@ bb:
     v_nop_e64", "={v7}"()
   %cmp.var = fcmp olt float %var, 0.0
   call void @llvm.amdgcn.kill(i1 %cmp.var)
-  %vgpr = load volatile i32, i32 addrspace(1)* undef
+  %vgpr = load volatile i32, ptr addrspace(1) undef
   %loop.cond = icmp eq i32 %vgpr, 0
   br i1 %loop.cond, label %bb, label %exit
 
 exit:
-  store volatile i32 8, i32 addrspace(1)* undef
+  store volatile i32 8, ptr addrspace(1) undef
   ret void
 }
 
@@ -1277,11 +1277,11 @@ phibb:
   br i1 %tmp6, label %bb10, label %end
 
 bb8:
-  store volatile i32 8, i32 addrspace(1)* undef
+  store volatile i32 8, ptr addrspace(1) undef
   br label %phibb
 
 bb10:
-  store volatile i32 9, i32 addrspace(1)* undef
+  store volatile i32 9, ptr addrspace(1) undef
   br label %end
 
 end:
@@ -1543,7 +1543,7 @@ bb4:                                              ; preds = %bb3, %bb
   br i1 %tmp7, label %bb8, label %bb9
 
 bb8:                                              ; preds = %bb9, %bb4
-  store volatile i32 9, i32 addrspace(1)* undef
+  store volatile i32 9, ptr addrspace(1) undef
   ret void
 
 bb9:                                              ; preds = %bb4

diff  --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll
index 75a1ce4e380ee..d9e21e5a83c99 100644
--- a/llvm/test/CodeGen/AMDGPU/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll
@@ -6,11 +6,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32:
 ; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define amdgpu_kernel void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_smed3_r_i_i_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep0
 
   %icmp0 = icmp sgt i32 %a, 12
   %i0 = select i1 %icmp0, i32 %a, i32 12
@@ -18,18 +18,18 @@ define amdgpu_kernel void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 ad
   %icmp1 = icmp slt i32 %i0, 17
   %i1 = select i1 %icmp1, i32 %i0, i32 17
 
-  store i32 %i1, i32 addrspace(1)* %outgep
+  store i32 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_smed3_multi_use_r_i_i_i32:
 ; GCN: v_max_i32
 ; GCN: v_min_i32
-define amdgpu_kernel void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_smed3_multi_use_r_i_i_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep0
 
   %icmp0 = icmp sgt i32 %a, 12
   %i0 = select i1 %icmp0, i32 %a, i32 12
@@ -37,19 +37,19 @@ define amdgpu_kernel void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %o
   %icmp1 = icmp slt i32 %i0, 17
   %i1 = select i1 %icmp1, i32 %i0, i32 17
 
-  store volatile i32 %i0, i32 addrspace(1)* %outgep
-  store volatile i32 %i1, i32 addrspace(1)* %outgep
+  store volatile i32 %i0, ptr addrspace(1) %outgep
+  store volatile i32 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_sign_mismatch_i32:
 ; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
 ; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
-define amdgpu_kernel void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_smed3_r_i_i_sign_mismatch_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep0
 
   %icmp0 = icmp ugt i32 %a, 12
   %i0 = select i1 %icmp0, i32 %a, i32 12
@@ -57,18 +57,18 @@ define amdgpu_kernel void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)
   %icmp1 = icmp slt i32 %i0, 17
   %i1 = select i1 %icmp1, i32 %i0, i32 17
 
-  store i32 %i1, i32 addrspace(1)* %outgep
+  store i32 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i64:
 ; GCN: v_cmp_lt_i64
 ; GCN: v_cmp_gt_i64
-define amdgpu_kernel void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_smed3_r_i_i_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
-  %a = load i64, i64 addrspace(1)* %gep0
+  %gep0 = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
+  %a = load i64, ptr addrspace(1) %gep0
 
   %icmp0 = icmp sgt i64 %a, 12
   %i0 = select i1 %icmp0, i64 %a, i64 12
@@ -76,7 +76,7 @@ define amdgpu_kernel void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 ad
   %icmp1 = icmp slt i64 %i0, 17
   %i1 = select i1 %icmp1, i64 %i0, i64 17
 
-  store i64 %i1, i64 addrspace(1)* %outgep
+  store i64 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -96,11 +96,11 @@ declare i64 @llvm.smin.i64(i64, i64)
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16:
 ; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define amdgpu_kernel void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_smed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep0
+  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep0
 
   %icmp0 = icmp sgt i16 %a, 12
   %i0 = select i1 %icmp0, i16 %a, i16 12
@@ -108,7 +108,7 @@ define amdgpu_kernel void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 ad
   %icmp1 = icmp slt i16 %i0, 17
   %i1 = select i1 %icmp1, i16 %i0, i16 17
 
-  store i16 %i1, i16 addrspace(1)* %outgep
+  store i16 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -168,193 +168,193 @@ define internal i8 @smax8(i8 %x, i8 %y) #2 {
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_1:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_1(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_2:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_2(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_3:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_3(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_4:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_4(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_5:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_5(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_6:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_6(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_7:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_7(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_8:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_8(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_9:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_9(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_10:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_10(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_11:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_11(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_12:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_12(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_13:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_13(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_14:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_14(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_15:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_15(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -373,193 +373,193 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_16:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_16(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_16(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_17:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_17(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_17(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_18:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_18(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_18(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_19:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_19(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_19(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_20:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_20(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_20(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_21:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_21(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_21(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_22:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_22(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_22(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_23:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_23(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_23(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_24:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_24(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_24(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_25:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_25(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_25(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @smin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_26:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_26(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_26(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_27:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_27(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_27(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_28:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_28(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_28(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_29:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_29(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_29(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_30:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_30(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_30(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_31:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_31(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_31(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
   %tmp2 = call i32 @smax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @smin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -569,13 +569,13 @@ bb:
 ; GCN: s_sext_i32_i16
 ; GCN: s_sext_i32_i16
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, [8 x i32], i16 %x, [8 x i32], i16 %y, [8 x i32], i16 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i16_pat_0(ptr addrspace(1) %arg, [8 x i32], i16 %x, [8 x i32], i16 %y, [8 x i32], i16 %z) #1 {
 bb:
   %tmp0 = call i16 @smin16(i16 %x, i16 %y)
   %tmp1 = call i16 @smax16(i16 %x, i16 %y)
   %tmp2 = call i16 @smin16(i16 %tmp1, i16 %z)
   %tmp3 = call i16 @smax16(i16 %tmp0, i16 %tmp2)
-  store i16 %tmp3, i16 addrspace(1)* %arg
+  store i16 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -584,13 +584,13 @@ bb:
 ; GCN: s_sext_i32_i8
 ; GCN: s_sext_i32_i8
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, [8 x i32], i8 %x, [8 x i32], i8 %y, [8 x i32], i8 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i8_pat_0(ptr addrspace(1) %arg, [8 x i32], i8 %x, [8 x i32], i8 %y, [8 x i32], i8 %z) #1 {
 bb:
   %tmp0 = call i8 @smin8(i8 %x, i8 %y)
   %tmp1 = call i8 @smax8(i8 %x, i8 %y)
   %tmp2 = call i8 @smin8(i8 %tmp1, i8 %z)
   %tmp3 = call i8 @smax8(i8 %tmp0, i8 %tmp2)
-  store i8 %tmp3, i8 addrspace(1)* %arg
+  store i8 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -598,14 +598,14 @@ bb:
 ; GCN: s_min_i32
 ; GCN-NOT: {{s_min_i32|s_max_i32}}
 ; GCN: v_med3_i32
-define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_0(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
-  store volatile i32 %tmp0, i32 addrspace(1)* %arg
-  store volatile i32 %tmp3, i32 addrspace(1)* %arg
+  store volatile i32 %tmp0, ptr addrspace(1) %arg
+  store volatile i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -613,14 +613,14 @@ bb:
 ; GCN: s_max_i32
 ; GCN-NOT: {{s_min_i32|s_max_i32}}
 ; GCN: v_med3_i32
-define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_1(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
-  store volatile i32 %tmp1, i32 addrspace(1)* %arg
-  store volatile i32 %tmp3, i32 addrspace(1)* %arg
+  store volatile i32 %tmp1, ptr addrspace(1) %arg
+  store volatile i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -629,28 +629,28 @@ bb:
 ; GCN: s_min_i32
 ; GCN-NOT: {{s_min_i32|s_max_i32}}
 ; GCN: v_med3_i32
-define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_2(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
-  store volatile i32 %tmp2, i32 addrspace(1)* %arg
-  store volatile i32 %tmp3, i32 addrspace(1)* %arg
+  store volatile i32 %tmp2, ptr addrspace(1) %arg
+  store volatile i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_result:
 ; GCN-NOT: {{s_min_i32|s_max_i32}}
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_result(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
   %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
-  store volatile i32 %tmp3, i32 addrspace(1)* %arg
-  store volatile i32 %tmp3, i32 addrspace(1)* %arg
+  store volatile i32 %tmp3, ptr addrspace(1) %arg
+  store volatile i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -658,7 +658,7 @@ bb:
 ; GCN-NOT: {{s_min_i32|s_max_i32}}
 ; GCN: v_med3_i32 v{{[0-9]+}}, [[B0:s[0-9]+]], [[B1:v[0-9]+]], v{{[0-9]+}}
 ; GCN: v_med3_i32 v{{[0-9]+}}, [[B0]], [[B1]], v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_reuse_bounds(i32 addrspace(1)* %arg, i32 %b0, i32 %b1, i32 %x, i32 %y) #1 {
+define amdgpu_kernel void @s_test_smed3_reuse_bounds(ptr addrspace(1) %arg, i32 %b0, i32 %b1, i32 %x, i32 %y) #1 {
 bb:
   %lo = call i32 @smin(i32 %b0, i32 %b1)
   %hi = call i32 @smax(i32 %b0, i32 %b1)
@@ -669,8 +669,8 @@ bb:
   %tmp1 = call i32 @smin(i32 %y, i32 %hi)
   %z1 = call i32 @smax(i32 %tmp1, i32 %lo)
 
-  store volatile i32 %z0, i32 addrspace(1)* %arg
-  store volatile i32 %z1, i32 addrspace(1)* %arg
+  store volatile i32 %z0, ptr addrspace(1) %arg
+  store volatile i32 %z1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -684,44 +684,44 @@ bb:
 ; VI: v_max_i16
 
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+define amdgpu_kernel void @v_test_smed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
-  %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
-  %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %x = load i16, i16 addrspace(1)* %gep0
-  %y = load i16, i16 addrspace(1)* %gep1
-  %z = load i16, i16 addrspace(1)* %gep2
+  %gep0 = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i32 %tid
+  %gep1 = getelementptr inbounds i16, ptr addrspace(1) %gep0, i32 3
+  %gep2 = getelementptr inbounds i16, ptr addrspace(1) %gep0, i32 8
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %x = load i16, ptr addrspace(1) %gep0
+  %y = load i16, ptr addrspace(1) %gep1
+  %z = load i16, ptr addrspace(1) %gep2
 
   %tmp0 = call i16 @smin16(i16 %x, i16 %y)
   %tmp1 = call i16 @smax16(i16 %x, i16 %y)
   %tmp2 = call i16 @smin16(i16 %tmp1, i16 %z)
   %tmp3 = call i16 @smax16(i16 %tmp0, i16 %tmp2)
-  store i16 %tmp3, i16 addrspace(1)* %out.gep
+  store i16 %tmp3, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_smed3_i16_pat_1:
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
-define amdgpu_kernel void @v_test_smed3_i16_pat_1(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+define amdgpu_kernel void @v_test_smed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
-  %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
-  %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %x = load i16, i16 addrspace(1)* %gep0
-  %y = load i16, i16 addrspace(1)* %gep1
-  %z = load i16, i16 addrspace(1)* %gep2
+  %gep0 = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i32 %tid
+  %gep1 = getelementptr inbounds i16, ptr addrspace(1) %gep0, i32 3
+  %gep2 = getelementptr inbounds i16, ptr addrspace(1) %gep0, i32 8
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %x = load i16, ptr addrspace(1) %gep0
+  %y = load i16, ptr addrspace(1) %gep1
+  %z = load i16, ptr addrspace(1) %gep2
 
   %tmp0 = call i16 @smin16(i16 %x, i16 %y)
   %tmp1 = call i16 @smax16(i16 %x, i16 %y)
   %tmp2 = call i16 @smax16(i16 %tmp0, i16 %z)
   %tmp3 = call i16 @smin16(i16 %tmp1, i16 %tmp2)
-  store i16 %tmp3, i16 addrspace(1)* %out.gep
+  store i16 %tmp3, ptr addrspace(1) %out.gep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sminmax.ll b/llvm/test/CodeGen/AMDGPU/sminmax.ll
index 26846d20271bd..c52ef9d48b934 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.ll
@@ -8,12 +8,12 @@
 ; GCN: s_add_i32
 
 ; EG: MAX_INT
-define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_abs_i32(ptr addrspace(1) %out, i32 %val) nounwind {
   %neg = sub i32 0, %val
   %cond = icmp sgt i32 %val, %neg
   %res = select i1 %cond, i32 %val, i32 %neg
   %res2 = add i32 %res, 2
-  store i32 %res2, i32 addrspace(1)* %out, align 4
+  store i32 %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -27,15 +27,15 @@ define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, 2
 
 ; EG: MAX_INT
-define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_i32(ptr addrspace(1) %out, ptr addrspace(1) %src) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %src, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds i32, ptr addrspace(1) %src, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in, align 4
   %neg = sub i32 0, %val
   %cond = icmp sgt i32 %val, %neg
   %res = select i1 %cond, i32 %val, i32 %neg
   %res2 = add i32 %res, 2
-  store i32 %res2, i32 addrspace(1)* %out, align 4
+  store i32 %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -44,15 +44,15 @@ define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
 ; GFX9: v_sub_u32_e32 [[NEG:v[0-9]+]], 0, [[SRC:v[0-9]+]]
 ; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[SRC]], [[NEG]]
 ; GCN: v_mul_lo_u32 v{{[0-9]+}}, [[MAX]], [[MAX]]
-define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_i32_repeat_user(ptr addrspace(1) %out, ptr addrspace(1) %src) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %src, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds i32, ptr addrspace(1) %src, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep.in, align 4
   %neg = sub i32 0, %val
   %cond = icmp sgt i32 %val, %neg
   %res = select i1 %cond, i32 %val, i32 %neg
   %mul = mul i32 %res, %res
-  store i32 %mul, i32 addrspace(1)* %out, align 4
+  store i32 %mul, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -64,7 +64,7 @@ define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 add
 
 ; EG: MAX_INT
 ; EG: MAX_INT
-define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
+define amdgpu_kernel void @s_abs_v2i32(ptr addrspace(1) %out, <2 x i32> %val) nounwind {
   %z0 = insertelement <2 x i32> undef, i32 0, i32 0
   %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
   %t0 = insertelement <2 x i32> undef, i32 2, i32 0
@@ -73,7 +73,7 @@ define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
   %cond = icmp sgt <2 x i32> %val, %neg
   %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg
   %res2 = add <2 x i32> %res, %t1
-  store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4
+  store <2 x i32> %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -95,19 +95,19 @@ define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
 
 ; EG: MAX_INT
 ; EG: MAX_INT
-define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %src) nounwind {
   %z0 = insertelement <2 x i32> undef, i32 0, i32 0
   %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
   %t0 = insertelement <2 x i32> undef, i32 2, i32 0
   %t1 = insertelement <2 x i32> %t0, i32 2, i32 1
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %src, i32 %tid
-  %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <2 x i32>, ptr addrspace(1) %src, i32 %tid
+  %val = load <2 x i32>, ptr addrspace(1) %gep.in, align 4
   %neg = sub <2 x i32> %z1, %val
   %cond = icmp sgt <2 x i32> %val, %neg
   %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg
   %res2 = add <2 x i32> %res, %t1
-  store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4
+  store <2 x i32> %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -127,7 +127,7 @@ define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> a
 ; EG: MAX_INT
 ; EG: MAX_INT
 ; EG: MAX_INT
-define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind {
+define amdgpu_kernel void @s_abs_v4i32(ptr addrspace(1) %out, <4 x i32> %val) nounwind {
   %z0 = insertelement <4 x i32> undef, i32 0, i32 0
   %z1 = insertelement <4 x i32> %z0, i32 0, i32 1
   %z2 = insertelement <4 x i32> %z1, i32 0, i32 2
@@ -140,7 +140,7 @@ define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
   %cond = icmp sgt <4 x i32> %val, %neg
   %res = select <4 x i1> %cond, <4 x i32> %val, <4 x i32> %neg
   %res2 = add <4 x i32> %res, %t3
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out, align 4
+  store <4 x i32> %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -175,7 +175,7 @@ define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
 ; EG: MAX_INT
 ; EG: MAX_INT
 ; EG: MAX_INT
-define amdgpu_kernel void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %src) nounwind {
   %z0 = insertelement <4 x i32> undef, i32 0, i32 0
   %z1 = insertelement <4 x i32> %z0, i32 0, i32 1
   %z2 = insertelement <4 x i32> %z1, i32 0, i32 2
@@ -185,13 +185,13 @@ define amdgpu_kernel void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> a
   %t2 = insertelement <4 x i32> %t1, i32 2, i32 2
   %t3 = insertelement <4 x i32> %t2, i32 2, i32 3
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %src, i32 %tid
-  %val = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <4 x i32>, ptr addrspace(1) %src, i32 %tid
+  %val = load <4 x i32>, ptr addrspace(1) %gep.in, align 4
   %neg = sub <4 x i32> %z3, %val
   %cond = icmp sgt <4 x i32> %val, %neg
   %res = select <4 x i1> %cond, <4 x i32> %val, <4 x i32> %neg
   %res2 = add <4 x i32> %res, %t3
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out, align 4
+  store <4 x i32> %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -201,13 +201,13 @@ define amdgpu_kernel void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> a
 
 ; GCN-DAG: s_min_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
 ; GCN-DAG: s_max_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
-define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], i32 %val0, [8 x i32], i32 %val1) nounwind {
+define amdgpu_kernel void @s_min_max_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, [8 x i32], i32 %val0, [8 x i32], i32 %val1) nounwind {
   %cond0 = icmp sgt i32 %val0, %val1
   %sel0 = select i1 %cond0, i32 %val0, i32 %val1
   %sel1 = select i1 %cond0, i32 %val1, i32 %val0
 
-  store volatile i32 %sel0, i32 addrspace(1)* %out0, align 4
-  store volatile i32 %sel1, i32 addrspace(1)* %out1, align 4
+  store volatile i32 %sel0, ptr addrspace(1) %out0, align 4
+  store volatile i32 %sel1, ptr addrspace(1) %out1, align 4
   ret void
 }
 
@@ -217,16 +217,16 @@ define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(
 
 ; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]]
 ; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]]
-define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
-  %val0 = load volatile i32, i32 addrspace(1)* %ptr0
-  %val1 = load volatile i32, i32 addrspace(1)* %ptr1
+define amdgpu_kernel void @v_min_max_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) nounwind {
+  %val0 = load volatile i32, ptr addrspace(1) %ptr0
+  %val1 = load volatile i32, ptr addrspace(1) %ptr1
 
   %cond0 = icmp sgt i32 %val0, %val1
   %sel0 = select i1 %cond0, i32 %val0, i32 %val1
   %sel1 = select i1 %cond0, i32 %val1, i32 %val0
 
-  store volatile i32 %sel0, i32 addrspace(1)* %out0, align 4
-  store volatile i32 %sel1, i32 addrspace(1)* %out1, align 4
+  store volatile i32 %sel0, ptr addrspace(1) %out0, align 4
+  store volatile i32 %sel1, ptr addrspace(1) %out1, align 4
   ret void
 }
 
@@ -239,13 +239,13 @@ define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(
 ; GCN-DAG: s_max_i32
 ; GCN-DAG: s_max_i32
 ; GCN-DAG: s_max_i32
-define amdgpu_kernel void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind {
+define amdgpu_kernel void @s_min_max_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind {
   %cond0 = icmp sgt <4 x i32> %val0, %val1
   %sel0 = select <4 x i1> %cond0, <4 x i32> %val0, <4 x i32> %val1
   %sel1 = select <4 x i1> %cond0, <4 x i32> %val1, <4 x i32> %val0
 
-  store volatile <4 x i32> %sel0, <4 x i32> addrspace(1)* %out0, align 4
-  store volatile <4 x i32> %sel1, <4 x i32> addrspace(1)* %out1, align 4
+  store volatile <4 x i32> %sel0, ptr addrspace(1) %out0, align 4
+  store volatile <4 x i32> %sel1, ptr addrspace(1) %out1, align 4
   ret void
 }
 
@@ -254,17 +254,17 @@ define amdgpu_kernel void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i
 ; GCN-DAG: v_cndmask_b32_e32
 ; GCN-DAG: v_cndmask_b32_e32
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
-define amdgpu_kernel void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
-  %val0 = load volatile i32, i32 addrspace(1)* %ptr0
-  %val1 = load volatile i32, i32 addrspace(1)* %ptr1
+define amdgpu_kernel void @v_min_max_i32_user(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) nounwind {
+  %val0 = load volatile i32, ptr addrspace(1) %ptr0
+  %val1 = load volatile i32, ptr addrspace(1) %ptr1
 
   %cond0 = icmp sgt i32 %val0, %val1
   %sel0 = select i1 %cond0, i32 %val0, i32 %val1
   %sel1 = select i1 %cond0, i32 %val1, i32 %val0
 
-  store volatile i32 %sel0, i32 addrspace(1)* %out0, align 4
-  store volatile i32 %sel1, i32 addrspace(1)* %out1, align 4
-  store volatile i1 %cond0, i1 addrspace(1)* undef
+  store volatile i32 %sel0, ptr addrspace(1) %out0, align 4
+  store volatile i32 %sel1, ptr addrspace(1) %out1, align 4
+  store volatile i1 %cond0, ptr addrspace(1) undef
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index b08bb8b55264a..9a6851c162d09 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -17,12 +17,12 @@
 ; CIVI-DAG: s_add_i32
 ; CIVI-DAG: s_and_b32
 ; CIVI-DAG: s_or_b32
-define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
+define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0 {
   %neg = sub <2 x i16> zeroinitializer, %val
   %cond = icmp sgt <2 x i16> %val, %neg
   %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
   %res2 = add <2 x i16> %res, <i16 2, i16 2>
-  store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -53,16 +53,16 @@ define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %
 ; CI-DAG: v_add_i32
 ; CI-DAG: v_add_i32
 ; CI-DAG: v_or_b32
-define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
+define amdgpu_kernel void @v_abs_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %val = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <2 x i16>, ptr addrspace(1) %src, i32 %tid
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %val = load <2 x i16>, ptr addrspace(1) %gep.in, align 4
   %neg = sub <2 x i16> zeroinitializer, %val
   %cond = icmp sgt <2 x i16> %val, %neg
   %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
   %res2 = add <2 x i16> %res, <i16 2, i16 2>
-  store <2 x i16> %res2, <2 x i16> addrspace(1)* %gep.out, align 4
+  store <2 x i16> %res2, ptr addrspace(1) %gep.out, align 4
   ret void
 }
 
@@ -71,7 +71,7 @@ define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
 ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
 ; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
-define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
+define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val) #0 {
   %z0 = insertelement <2 x i16> undef, i16 0, i16 0
   %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
   %t0 = insertelement <2 x i16> undef, i16 2, i16 0
@@ -80,7 +80,7 @@ define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16>
   %cond = icmp sgt <2 x i16> %val, %neg
   %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
   %res2 = add <2 x i16> %res, %t1
-  store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -89,19 +89,19 @@ define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
 ; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
-define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
+define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 {
   %z0 = insertelement <2 x i16> undef, i16 0, i16 0
   %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
   %t0 = insertelement <2 x i16> undef, i16 2, i16 0
   %t1 = insertelement <2 x i16> %t0, i16 2, i16 1
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid
-  %val = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <2 x i16>, ptr addrspace(1) %src, i32 %tid
+  %val = load <2 x i16>, ptr addrspace(1) %gep.in, align 4
   %neg = sub <2 x i16> %z1, %val
   %cond = icmp sgt <2 x i16> %val, %neg
   %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
   %res2 = add <2 x i16> %res, %t1
-  store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4
+  store <2 x i16> %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -113,7 +113,7 @@ define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[#LOAD + 3]], [[SUB1]]
 ; GFX9-DAG: v_pk_sub_u16 [[ADD0:v[0-9]+]], [[MAX0]], -2 op_sel_hi:[1,0]
 ; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0]
-define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 {
+define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 {
   %z0 = insertelement <4 x i16> undef, i16 0, i16 0
   %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
   %z2 = insertelement <4 x i16> %z1, i16 0, i16 2
@@ -126,7 +126,7 @@ define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %
   %cond = icmp sgt <4 x i16> %val, %neg
   %res = select <4 x i1> %cond, <4 x i16> %val, <4 x i16> %neg
   %res2 = add <4 x i16> %res, %t3
-  store <4 x i16> %res2, <4 x i16> addrspace(1)* %out, align 4
+  store <4 x i16> %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -140,7 +140,7 @@ define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %
 ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]]
 ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]]
 ; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0]
-define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 {
+define amdgpu_kernel void @v_abs_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 {
   %z0 = insertelement <4 x i16> undef, i16 0, i16 0
   %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
   %z2 = insertelement <4 x i16> %z1, i16 0, i16 2
@@ -150,42 +150,42 @@ define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
   %t2 = insertelement <4 x i16> %t1, i16 2, i16 2
   %t3 = insertelement <4 x i16> %t2, i16 2, i16 3
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.in = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %src, i32 %tid
-  %val = load <4 x i16>, <4 x i16> addrspace(1)* %gep.in, align 4
+  %gep.in = getelementptr inbounds <4 x i16>, ptr addrspace(1) %src, i32 %tid
+  %val = load <4 x i16>, ptr addrspace(1) %gep.in, align 4
   %neg = sub <4 x i16> %z3, %val
   %cond = icmp sgt <4 x i16> %val, %neg
   %res = select <4 x i1> %cond, <4 x i16> %val, <4 x i16> %neg
   %res2 = add <4 x i16> %res, %t3
-  store <4 x i16> %res2, <4 x i16> addrspace(1)* %out, align 4
+  store <4 x i16> %res2, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_min_max_v2i16:
 ; GFX9: v_pk_max_i16
 ; GFX9: v_pk_min_i16
-define amdgpu_kernel void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) #0 {
+define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %val0, <2 x i16> %val1) #0 {
   %cond0 = icmp sgt <2 x i16> %val0, %val1
   %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
   %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
 
-  store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
-  store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+  store volatile <2 x i16> %sel0, ptr addrspace(1) %out0, align 4
+  store volatile <2 x i16> %sel1, ptr addrspace(1) %out1, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_min_max_v2i16:
 ; GFX9: v_pk_max_i16
 ; GFX9: v_pk_min_i16
-define amdgpu_kernel void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 {
-  %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0
-  %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1
+define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) #0 {
+  %val0 = load volatile <2 x i16>, ptr addrspace(1) %ptr0
+  %val1 = load volatile <2 x i16>, ptr addrspace(1) %ptr1
 
   %cond0 = icmp sgt <2 x i16> %val0, %val1
   %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
   %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
 
-  store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
-  store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+  store volatile <2 x i16> %sel0, ptr addrspace(1) %out0, align 4
+  store volatile <2 x i16> %sel1, ptr addrspace(1) %out1, align 4
   ret void
 }
 
@@ -194,41 +194,41 @@ define amdgpu_kernel void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i
 ; GFX9-DAG: v_pk_min_i16
 ; GFX9-DAG: v_pk_max_i16
 ; GFX9-DAG: v_pk_min_i16
-define amdgpu_kernel void @s_min_max_v4i16(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 {
+define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i16> %val0, <4 x i16> %val1) #0 {
   %cond0 = icmp sgt <4 x i16> %val0, %val1
   %sel0 = select <4 x i1> %cond0, <4 x i16> %val0, <4 x i16> %val1
   %sel1 = select <4 x i1> %cond0, <4 x i16> %val1, <4 x i16> %val0
 
-  store volatile <4 x i16> %sel0, <4 x i16> addrspace(1)* %out0, align 4
-  store volatile <4 x i16> %sel1, <4 x i16> addrspace(1)* %out1, align 4
+  store volatile <4 x i16> %sel0, ptr addrspace(1) %out0, align 4
+  store volatile <4 x i16> %sel1, ptr addrspace(1) %out1, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_min_max_v2i16_user:
-define amdgpu_kernel void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 {
-  %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0
-  %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1
+define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) #0 {
+  %val0 = load volatile <2 x i16>, ptr addrspace(1) %ptr0
+  %val1 = load volatile <2 x i16>, ptr addrspace(1) %ptr1
 
   %cond0 = icmp sgt <2 x i16> %val0, %val1
   %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
   %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
 
-  store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
-  store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
-  store volatile <2 x i1> %cond0, <2 x i1> addrspace(1)* undef
+  store volatile <2 x i16> %sel0, ptr addrspace(1) %out0, align 4
+  store volatile <2 x i16> %sel1, ptr addrspace(1) %out1, align 4
+  store volatile <2 x i1> %cond0, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}u_min_max_v2i16:
 ; GFX9: v_pk_max_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9: v_pk_min_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind {
+define amdgpu_kernel void @u_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind {
   %cond0 = icmp ugt <2 x i16> %val0, %val1
   %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
   %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
 
-  store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
-  store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+  store volatile <2 x i16> %sel0, ptr addrspace(1) %out0, align 4
+  store volatile <2 x i16> %sel1, ptr addrspace(1) %out1, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index 0782b2f4ba35c..49c1bb378bcc6 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -8,11 +8,11 @@
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 ; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @smrd0(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 1
-  %tmp1 = load i32, i32 addrspace(4)* %tmp
-  store i32 %tmp1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 1
+  %tmp1 = load i32, ptr addrspace(4) %tmp
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -20,11 +20,11 @@ entry:
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
 ; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @smrd1(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 255
-  %tmp1 = load i32, i32 addrspace(4)* %tmp
-  store i32 %tmp1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 255
+  %tmp1 = load i32, ptr addrspace(4) %tmp
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -35,11 +35,11 @@ entry:
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @smrd2(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 256
-  %tmp1 = load i32, i32 addrspace(4)* %tmp
-  store i32 %tmp1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 256
+  %tmp1 = load i32, ptr addrspace(4) %tmp
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -50,11 +50,11 @@ entry:
 ; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0x13 ; encoding: [0x13
 ; TODO: Add VI checks
 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, [8 x i32], i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @smrd3(ptr addrspace(1) %out, [8 x i32], ptr addrspace(4) %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296
-  %tmp1 = load i32, i32 addrspace(4)* %tmp
-  store i32 %tmp1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 4294967296
+  %tmp1 = load i32, ptr addrspace(4) %tmp
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -66,11 +66,11 @@ entry:
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
 ; GFX9_10: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
 ; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
-define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @smrd4(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143
-  %tmp1 = load i32, i32 addrspace(4)* %tmp
-  store i32 %tmp1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 262143
+  %tmp1 = load i32, ptr addrspace(4) %tmp
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -80,11 +80,11 @@ entry:
 ; SIVIGFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @smrd5(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144
-  %tmp1 = load i32, i32 addrspace(4)* %tmp
-  store i32 %tmp1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 262144
+  %tmp1 = load i32, ptr addrspace(4) %tmp
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -93,11 +93,11 @@ entry:
 ; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
 ; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
 ; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, -0x4
-define amdgpu_kernel void @smrd6(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 -1
-  %tmp1 = load i32, i32 addrspace(4)* %tmp
-  store i32 %tmp1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -1
+  %tmp1 = load i32, ptr addrspace(4) %tmp
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -106,11 +106,11 @@ entry:
 ; GCN: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, 0xffe00000
 ; SICIVI: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
 ; GFX9_10: s_load_dword s{{[0-9]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
-define amdgpu_kernel void @smrd7(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @smrd7(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
 entry:
-  %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 -524288
-  %tmp1 = load i32, i32 addrspace(4)* %tmp
-  store i32 %tmp1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -524288
+  %tmp1 = load i32, ptr addrspace(4) %tmp
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
@@ -135,10 +135,9 @@ main_body:
 ; GCN-LABEL: {{^}}smrd_load_const0:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
 ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
-define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+define amdgpu_ps void @smrd_load_const0(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
+  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
   ret void
@@ -151,12 +150,11 @@ main_body:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff glc ; encoding: [0xff
 ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc ;
 ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x3fc glc ;
-define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+define amdgpu_ps void @smrd_load_const1(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
+  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1020, i32 0)
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1020, i32 1)
   %s.buffer.float = bitcast i32 %s.buffer to float
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
@@ -174,12 +172,11 @@ main_body:
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400
 ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0x400
-define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+define amdgpu_ps void @smrd_load_const2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
+  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1024, i32 0)
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1024, i32 0)
   %s.buffer.float = bitcast i32 %s.buffer to float
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
@@ -195,12 +192,11 @@ main_body:
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc
 ; VIGFX9_10: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]+}}], 0xffffc
-define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+define amdgpu_ps void @smrd_load_const3(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
+  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1048572, i32 0)
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048572, i32 0)
   %s.buffer.float = bitcast i32 %s.buffer to float
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
@@ -215,12 +211,11 @@ main_body:
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+define amdgpu_ps void @smrd_load_const4(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
+  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1048576, i32 0)
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048576, i32 0)
   %s.buffer.float = bitcast i32 %s.buffer to float
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
@@ -231,9 +226,9 @@ main_body:
 ; GCN-LABEL: {{^}}s_buffer_load_dwordx2:
 ; VIGFX9_10: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
 ; SICI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
-define amdgpu_ps void @s_buffer_load_dwordx2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+define amdgpu_ps void @s_buffer_load_dwordx2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
 main_body:
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 128, i32 0)
   %s.buffer.0 = extractelement <2 x i32> %s.buffer, i32 0
   %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
@@ -247,9 +242,9 @@ main_body:
 ; GCN-LABEL: {{^}}s_buffer_load_dwordx4:
 ; VIGFX9_10: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
 ; SICI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
-define amdgpu_ps void @s_buffer_load_dwordx4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+define amdgpu_ps void @s_buffer_load_dwordx4(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
 main_body:
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %tmp22, i32 128, i32 0)
   %s.buffer.0 = extractelement <4 x i32> %s.buffer, i32 0
   %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
@@ -267,9 +262,9 @@ main_body:
 ; GCN-LABEL: {{^}}s_buffer_load_dwordx8:
 ; VIGFX9_10: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
 ; SICI: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
-define amdgpu_ps void @s_buffer_load_dwordx8(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+define amdgpu_ps void @s_buffer_load_dwordx8(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
 main_body:
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 128, i32 0)
   %s.buffer.0 = extractelement <8 x i32> %s.buffer, i32 0
   %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
@@ -287,9 +282,9 @@ main_body:
 ; GCN-LABEL: {{^}}s_buffer_load_dwordx8_v8f32:
 ; VIGFX9_10: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
 ; SICI: s_buffer_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
-define amdgpu_ps void @s_buffer_load_dwordx8_v8f32(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+define amdgpu_ps void @s_buffer_load_dwordx8_v8f32(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
 main_body:
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %tmp22, i32 128, i32 0)
   %s.buffer.0 = extractelement <8 x float> %s.buffer, i32 0
   %s.buffer.1 = extractelement <8 x float> %s.buffer, i32 2
@@ -303,9 +298,9 @@ main_body:
 ; GCN-LABEL: {{^}}s_buffer_load_dwordx16:
 ; VIGFX9_10: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
 ; SICI: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
-define amdgpu_ps void @s_buffer_load_dwordx16(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+define amdgpu_ps void @s_buffer_load_dwordx16(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
 main_body:
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %tmp22, i32 128, i32 0)
   %s.buffer.0 = extractelement <16 x i32> %s.buffer, i32 0
   %s.buffer.0.float = bitcast i32 %s.buffer.0 to float
@@ -322,9 +317,9 @@ main_body:
 ; GCN-LABEL: {{^}}s_buffer_load_dwordx16_v16f32:
 ; VIGFX9_10: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x80
 ; SICI: s_buffer_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0x20
-define amdgpu_ps void @s_buffer_load_dwordx16_v16f32(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in) #0 {
+define amdgpu_ps void @s_buffer_load_dwordx16_v16f32(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in) #0 {
 main_body:
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %tmp22, i32 128, i32 0)
   %s.buffer.0 = extractelement <16 x float> %s.buffer, i32 0
   %s.buffer.1 = extractelement <16 x float> %s.buffer, i32 3
@@ -471,9 +466,8 @@ main_body:
 
 ; GCN-LABEL: {{^}}smrd_sgpr_descriptor_promoted
 ; GCN: v_readfirstlane
-define amdgpu_cs void @smrd_sgpr_descriptor_promoted([0 x i8] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), i32) #0 {
+define amdgpu_cs void @smrd_sgpr_descriptor_promoted(ptr addrspace(4) inreg noalias dereferenceable(18446744073709551615), i32) #0 {
 main_body:
-  %descptr = bitcast [0 x i8] addrspace(4)* %0 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0
   br label %.outer_loop_header
 
 ret_block:                                       ; preds = %.outer, %.label22, %main_body
@@ -489,9 +483,9 @@ ret_block:                                       ; preds = %.outer, %.label22, %
   br i1 %inner_br1, label %.inner_loop_body, label %ret_block
 
 .inner_loop_body:
-  %descriptor = load <4 x i32>, <4 x i32> addrspace(4)* %descptr, align 16, !invariant.load !0
+  %descriptor = load <4 x i32>, ptr addrspace(4) %0, align 16, !invariant.load !0
   %load1result = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %descriptor, i32 0, i32 0)
-  store float %load1result, float addrspace(1)* undef
+  store float %load1result, ptr addrspace(1) undef
   %inner_br2 = icmp uge i32 %1, 10
   br i1 %inner_br2, label %.inner_loop_header, label %.outer_loop_body
 
@@ -509,12 +503,11 @@ ret_block:                                       ; preds = %.outer, %.label22, %
 ; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 ; CI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 ; GCN: s_endpgm
-define amdgpu_ps void @smrd_load_nonconst0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 inreg %ncoff) #0 {
+define amdgpu_ps void @smrd_load_nonconst0(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in, i32 inreg %ncoff) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
+  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0)
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
   %s.buffer.float = bitcast i32 %s.buffer to float
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
@@ -528,12 +521,11 @@ main_body:
 ; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 ; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 ; GCN: s_endpgm
-define amdgpu_ps void @smrd_load_nonconst1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 %ncoff) #0 {
+define amdgpu_ps void @smrd_load_nonconst1(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in, i32 %ncoff) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
+  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0)
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
   %s.buffer.float = bitcast i32 %s.buffer to float
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %s.buffer.float, i1 true, i1 true) #0
@@ -547,12 +539,11 @@ main_body:
 ; CI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 ; CI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
 ; GCN: s_endpgm
-define amdgpu_ps void @smrd_load_nonconst2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 %ncoff) #0 {
+define amdgpu_ps void @smrd_load_nonconst2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in, i32 %ncoff) #0 {
 main_body:
-  %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
-  %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
+  %tmp20 = load <4 x i32>, ptr addrspace(4) %arg
   %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0)
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
   %s.buffer.elt = extractelement <8 x i32> %s.buffer, i32 1
   %s.buffer.float = bitcast i32 %s.buffer.elt to float
@@ -618,9 +609,9 @@ main_body:
 ; SIVIGFX9_10: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 ; CI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 ; GCN: s_endpgm
-define amdgpu_ps void @smrd_load_dwordx2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, <4 x i32> addrspace(4)* inreg %in, i32 inreg %ncoff) #0 {
+define amdgpu_ps void @smrd_load_dwordx2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, ptr addrspace(4) inreg %in, i32 inreg %ncoff) #0 {
 main_body:
-  %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %tmp22 = load <4 x i32>, ptr addrspace(4) %in
   %s.buffer = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
   %s.buffer.float = bitcast <2 x i32> %s.buffer to <2 x float>
   %r.1 = extractelement <2 x float> %s.buffer.float, i32 0
@@ -700,7 +691,7 @@ main_body:
   br i1 undef, label %if1, label %endif1
 
 if1:                                              ; preds = %main_body
-  store i32 0, i32 addrspace(3)* undef, align 4
+  store i32 0, ptr addrspace(3) undef, align 4
   br label %endif1
 
 endif1:                                           ; preds = %if1, %main_body

diff  --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
index 501da3ff09875..3176257920a7a 100644
--- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
+++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck %s
 
-define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrspace(4)* %wei_ptr, float addrspace(1)* %out_ptr, float addrspace(1)* %in) {
+define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) {
 ; CHECK-LABEL: excess_soft_clause_reg_pressure:
 ; CHECK:  BB0_1: ; %for.cond28.preheader
 ; CHECK:         s_load_dwordx16
@@ -24,9 +24,8 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrs
 ; CHECK-NOT: v_writelane_b32
 ; CHECK-NOT: v_readlane_b32
 entry:
-  %i = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %i1 = bitcast i8 addrspace(4)* %i to i64 addrspace(4)*
-  %i2 = load i64, i64 addrspace(4)* %i1, align 8
+  %i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i2 = load i64, ptr addrspace(4) %i, align 8
   %i3 = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %i4 = shl i32 %i3, 8
   %i5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5
@@ -35,8 +34,8 @@ entry:
   %conv = add i32 %i6, %i7
   %conv.frozen = freeze i32 %conv
   %div = udiv i32 %conv.frozen, 49
-  %add.ptr22 = getelementptr inbounds float, float addrspace(4)* %wei_ptr, i64 undef
-  %in.ptr1 = getelementptr inbounds float, float addrspace(1)* %in, i32 %i5
+  %add.ptr22 = getelementptr inbounds float, ptr addrspace(4) %wei_ptr, i64 undef
+  %in.ptr1 = getelementptr inbounds float, ptr addrspace(1) %in, i32 %i5
   br label %for.cond28.preheader
 
 for.cond28.preheader:                             ; preds = %for.cond28.preheader, %entry
@@ -72,272 +71,272 @@ for.cond28.preheader:                             ; preds = %for.cond28.preheade
   %accum.sroa.118.0 = phi float [ 0.000000e+00, %entry ], [ %i259, %for.cond28.preheader ]
   %accum.sroa.122.0 = phi float [ 0.000000e+00, %entry ], [ %i263, %for.cond28.preheader ]
   %accum.sroa.126.0 = phi float [ 0.000000e+00, %entry ], [ %i267, %for.cond28.preheader ]
-  %i_ptr.0288 = phi float addrspace(1)* [ %in.ptr1, %entry ], [ %add.ptr47.3, %for.cond28.preheader ]
-  %w_ptr.0287 = phi float addrspace(4)* [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ]
+  %i_ptr.0288 = phi ptr addrspace(1) [ %in.ptr1, %entry ], [ %add.ptr47.3, %for.cond28.preheader ]
+  %w_ptr.0287 = phi ptr addrspace(4) [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ]
   %ci.0286 = phi i32 [ 0, %entry ], [ %inc116, %for.cond28.preheader ]
-  %i8 = load float, float addrspace(1)* %i_ptr.0288, align 4
-  %add.ptr47 = getelementptr inbounds float, float addrspace(1)* %i_ptr.0288, i64 49
-  %i9 = load float, float addrspace(1)* %add.ptr47, align 4
-  %add.ptr47.1 = getelementptr inbounds float, float addrspace(1)* %i_ptr.0288, i64 98
-  %i10 = load float, float addrspace(1)* %add.ptr47.1, align 4
-  %add.ptr47.2 = getelementptr inbounds float, float addrspace(1)* %i_ptr.0288, i64 147
-  %i11 = load float, float addrspace(1)* %add.ptr47.2, align 4
-  %i12 = load float, float addrspace(4)* %w_ptr.0287, align 4
-  %add.ptr66 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1024
-  %i13 = load float, float addrspace(4)* %add.ptr66, align 4
-  %add.ptr66.1 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2048
-  %i14 = load float, float addrspace(4)* %add.ptr66.1, align 4
-  %add.ptr66.2 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3072
-  %i15 = load float, float addrspace(4)* %add.ptr66.2, align 4
-  %add.ptr70 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1
-  %i16 = load float, float addrspace(4)* %add.ptr70, align 4
-  %add.ptr66.1291 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1025
-  %i17 = load float, float addrspace(4)* %add.ptr66.1291, align 4
-  %add.ptr66.1.1 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2049
-  %i18 = load float, float addrspace(4)* %add.ptr66.1.1, align 4
-  %add.ptr66.2.1 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3073
-  %i19 = load float, float addrspace(4)* %add.ptr66.2.1, align 4
-  %add.ptr70.1 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2
-  %i20 = load float, float addrspace(4)* %add.ptr70.1, align 4
-  %add.ptr66.2293 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1026
-  %i21 = load float, float addrspace(4)* %add.ptr66.2293, align 4
-  %add.ptr66.1.2 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2050
-  %i22 = load float, float addrspace(4)* %add.ptr66.1.2, align 4
-  %add.ptr66.2.2 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3074
-  %i23 = load float, float addrspace(4)* %add.ptr66.2.2, align 4
-  %add.ptr70.2 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3
-  %i24 = load float, float addrspace(4)* %add.ptr70.2, align 4
-  %add.ptr66.3 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1027
-  %i25 = load float, float addrspace(4)* %add.ptr66.3, align 4
-  %add.ptr66.1.3 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2051
-  %i26 = load float, float addrspace(4)* %add.ptr66.1.3, align 4
-  %add.ptr66.2.3 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3075
-  %i27 = load float, float addrspace(4)* %add.ptr66.2.3, align 4
-  %add.ptr70.3 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 4
-  %i28 = load float, float addrspace(4)* %add.ptr70.3, align 4
-  %add.ptr66.4 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1028
-  %i29 = load float, float addrspace(4)* %add.ptr66.4, align 4
-  %add.ptr66.1.4 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2052
-  %i30 = load float, float addrspace(4)* %add.ptr66.1.4, align 4
-  %add.ptr66.2.4 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3076
-  %i31 = load float, float addrspace(4)* %add.ptr66.2.4, align 4
-  %add.ptr70.4 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 5
-  %i32 = load float, float addrspace(4)* %add.ptr70.4, align 4
-  %add.ptr66.5 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1029
-  %i33 = load float, float addrspace(4)* %add.ptr66.5, align 4
-  %add.ptr66.1.5 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2053
-  %i34 = load float, float addrspace(4)* %add.ptr66.1.5, align 4
-  %add.ptr66.2.5 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3077
-  %i35 = load float, float addrspace(4)* %add.ptr66.2.5, align 4
-  %add.ptr70.5 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 6
-  %i36 = load float, float addrspace(4)* %add.ptr70.5, align 4
-  %add.ptr66.6 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1030
-  %i37 = load float, float addrspace(4)* %add.ptr66.6, align 4
-  %add.ptr66.1.6 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2054
-  %i38 = load float, float addrspace(4)* %add.ptr66.1.6, align 4
-  %add.ptr66.2.6 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3078
-  %i39 = load float, float addrspace(4)* %add.ptr66.2.6, align 4
-  %add.ptr70.6 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 7
-  %i40 = load float, float addrspace(4)* %add.ptr70.6, align 4
-  %add.ptr66.7 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1031
-  %i41 = load float, float addrspace(4)* %add.ptr66.7, align 4
-  %add.ptr66.1.7 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2055
-  %i42 = load float, float addrspace(4)* %add.ptr66.1.7, align 4
-  %add.ptr66.2.7 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3079
-  %i43 = load float, float addrspace(4)* %add.ptr66.2.7, align 4
-  %add.ptr70.7 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 8
-  %i44 = load float, float addrspace(4)* %add.ptr70.7, align 4
-  %add.ptr66.8 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1032
-  %i45 = load float, float addrspace(4)* %add.ptr66.8, align 4
-  %add.ptr66.1.8 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2056
-  %i46 = load float, float addrspace(4)* %add.ptr66.1.8, align 4
-  %add.ptr66.2.8 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3080
-  %i47 = load float, float addrspace(4)* %add.ptr66.2.8, align 4
-  %add.ptr70.8 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 9
-  %i48 = load float, float addrspace(4)* %add.ptr70.8, align 4
-  %add.ptr66.9 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1033
-  %i49 = load float, float addrspace(4)* %add.ptr66.9, align 4
-  %add.ptr66.1.9 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2057
-  %i50 = load float, float addrspace(4)* %add.ptr66.1.9, align 4
-  %add.ptr66.2.9 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3081
-  %i51 = load float, float addrspace(4)* %add.ptr66.2.9, align 4
-  %add.ptr70.9 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 10
-  %i52 = load float, float addrspace(4)* %add.ptr70.9, align 4
-  %add.ptr66.10 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1034
-  %i53 = load float, float addrspace(4)* %add.ptr66.10, align 4
-  %add.ptr66.1.10 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2058
-  %i54 = load float, float addrspace(4)* %add.ptr66.1.10, align 4
-  %add.ptr66.2.10 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3082
-  %i55 = load float, float addrspace(4)* %add.ptr66.2.10, align 4
-  %add.ptr70.10 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 11
-  %i56 = load float, float addrspace(4)* %add.ptr70.10, align 4
-  %add.ptr66.11 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1035
-  %i57 = load float, float addrspace(4)* %add.ptr66.11, align 4
-  %add.ptr66.1.11 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2059
-  %i58 = load float, float addrspace(4)* %add.ptr66.1.11, align 4
-  %add.ptr66.2.11 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3083
-  %i59 = load float, float addrspace(4)* %add.ptr66.2.11, align 4
-  %add.ptr70.11 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 12
-  %i60 = load float, float addrspace(4)* %add.ptr70.11, align 4
-  %add.ptr66.12 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1036
-  %i61 = load float, float addrspace(4)* %add.ptr66.12, align 4
-  %add.ptr66.1.12 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2060
-  %i62 = load float, float addrspace(4)* %add.ptr66.1.12, align 4
-  %add.ptr66.2.12 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3084
-  %i63 = load float, float addrspace(4)* %add.ptr66.2.12, align 4
-  %add.ptr70.12 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 13
-  %i64 = load float, float addrspace(4)* %add.ptr70.12, align 4
-  %add.ptr66.13 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1037
-  %i65 = load float, float addrspace(4)* %add.ptr66.13, align 4
-  %add.ptr66.1.13 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2061
-  %i66 = load float, float addrspace(4)* %add.ptr66.1.13, align 4
-  %add.ptr66.2.13 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3085
-  %i67 = load float, float addrspace(4)* %add.ptr66.2.13, align 4
-  %add.ptr70.13 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 14
-  %i68 = load float, float addrspace(4)* %add.ptr70.13, align 4
-  %add.ptr66.14 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1038
-  %i69 = load float, float addrspace(4)* %add.ptr66.14, align 4
-  %add.ptr66.1.14 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2062
-  %i70 = load float, float addrspace(4)* %add.ptr66.1.14, align 4
-  %add.ptr66.2.14 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3086
-  %i71 = load float, float addrspace(4)* %add.ptr66.2.14, align 4
-  %add.ptr70.14 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 15
-  %i72 = load float, float addrspace(4)* %add.ptr70.14, align 4
-  %add.ptr66.15 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1039
-  %i73 = load float, float addrspace(4)* %add.ptr66.15, align 4
-  %add.ptr66.1.15 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2063
-  %i74 = load float, float addrspace(4)* %add.ptr66.1.15, align 4
-  %add.ptr66.2.15 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3087
-  %i75 = load float, float addrspace(4)* %add.ptr66.2.15, align 4
-  %add.ptr70.15 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 16
-  %i76 = load float, float addrspace(4)* %add.ptr70.15, align 4
-  %add.ptr66.16 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1040
-  %i77 = load float, float addrspace(4)* %add.ptr66.16, align 4
-  %add.ptr66.1.16 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2064
-  %i78 = load float, float addrspace(4)* %add.ptr66.1.16, align 4
-  %add.ptr66.2.16 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3088
-  %i79 = load float, float addrspace(4)* %add.ptr66.2.16, align 4
-  %add.ptr70.16 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 17
-  %i80 = load float, float addrspace(4)* %add.ptr70.16, align 4
-  %add.ptr66.17 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1041
-  %i81 = load float, float addrspace(4)* %add.ptr66.17, align 4
-  %add.ptr66.1.17 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2065
-  %i82 = load float, float addrspace(4)* %add.ptr66.1.17, align 4
-  %add.ptr66.2.17 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3089
-  %i83 = load float, float addrspace(4)* %add.ptr66.2.17, align 4
-  %add.ptr70.17 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 18
-  %i84 = load float, float addrspace(4)* %add.ptr70.17, align 4
-  %add.ptr66.18 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1042
-  %i85 = load float, float addrspace(4)* %add.ptr66.18, align 4
-  %add.ptr66.1.18 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2066
-  %i86 = load float, float addrspace(4)* %add.ptr66.1.18, align 4
-  %add.ptr66.2.18 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3090
-  %i87 = load float, float addrspace(4)* %add.ptr66.2.18, align 4
-  %add.ptr70.18 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 19
-  %i88 = load float, float addrspace(4)* %add.ptr70.18, align 4
-  %add.ptr66.19 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1043
-  %i89 = load float, float addrspace(4)* %add.ptr66.19, align 4
-  %add.ptr66.1.19 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2067
-  %i90 = load float, float addrspace(4)* %add.ptr66.1.19, align 4
-  %add.ptr66.2.19 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3091
-  %i91 = load float, float addrspace(4)* %add.ptr66.2.19, align 4
-  %add.ptr70.19 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 20
-  %i92 = load float, float addrspace(4)* %add.ptr70.19, align 4
-  %add.ptr66.20 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1044
-  %i93 = load float, float addrspace(4)* %add.ptr66.20, align 4
-  %add.ptr66.1.20 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2068
-  %i94 = load float, float addrspace(4)* %add.ptr66.1.20, align 4
-  %add.ptr66.2.20 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3092
-  %i95 = load float, float addrspace(4)* %add.ptr66.2.20, align 4
-  %add.ptr70.20 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 21
-  %i96 = load float, float addrspace(4)* %add.ptr70.20, align 4
-  %add.ptr66.21 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1045
-  %i97 = load float, float addrspace(4)* %add.ptr66.21, align 4
-  %add.ptr66.1.21 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2069
-  %i98 = load float, float addrspace(4)* %add.ptr66.1.21, align 4
-  %add.ptr66.2.21 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3093
-  %i99 = load float, float addrspace(4)* %add.ptr66.2.21, align 4
-  %add.ptr70.21 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 22
-  %i100 = load float, float addrspace(4)* %add.ptr70.21, align 4
-  %add.ptr66.22 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1046
-  %i101 = load float, float addrspace(4)* %add.ptr66.22, align 4
-  %add.ptr66.1.22 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2070
-  %i102 = load float, float addrspace(4)* %add.ptr66.1.22, align 4
-  %add.ptr66.2.22 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3094
-  %i103 = load float, float addrspace(4)* %add.ptr66.2.22, align 4
-  %add.ptr70.22 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 23
-  %i104 = load float, float addrspace(4)* %add.ptr70.22, align 4
-  %add.ptr66.23 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1047
-  %i105 = load float, float addrspace(4)* %add.ptr66.23, align 4
-  %add.ptr66.1.23 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2071
-  %i106 = load float, float addrspace(4)* %add.ptr66.1.23, align 4
-  %add.ptr66.2.23 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3095
-  %i107 = load float, float addrspace(4)* %add.ptr66.2.23, align 4
-  %add.ptr70.23 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 24
-  %i108 = load float, float addrspace(4)* %add.ptr70.23, align 4
-  %add.ptr66.24 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1048
-  %i109 = load float, float addrspace(4)* %add.ptr66.24, align 4
-  %add.ptr66.1.24 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2072
-  %i110 = load float, float addrspace(4)* %add.ptr66.1.24, align 4
-  %add.ptr66.2.24 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3096
-  %i111 = load float, float addrspace(4)* %add.ptr66.2.24, align 4
-  %add.ptr70.24 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 25
-  %i112 = load float, float addrspace(4)* %add.ptr70.24, align 4
-  %add.ptr66.25 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1049
-  %i113 = load float, float addrspace(4)* %add.ptr66.25, align 4
-  %add.ptr66.1.25 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2073
-  %i114 = load float, float addrspace(4)* %add.ptr66.1.25, align 4
-  %add.ptr66.2.25 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3097
-  %i115 = load float, float addrspace(4)* %add.ptr66.2.25, align 4
-  %add.ptr70.25 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 26
-  %i116 = load float, float addrspace(4)* %add.ptr70.25, align 4
-  %add.ptr66.26 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1050
-  %i117 = load float, float addrspace(4)* %add.ptr66.26, align 4
-  %add.ptr66.1.26 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2074
-  %i118 = load float, float addrspace(4)* %add.ptr66.1.26, align 4
-  %add.ptr66.2.26 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3098
-  %i119 = load float, float addrspace(4)* %add.ptr66.2.26, align 4
-  %add.ptr70.26 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 27
-  %i120 = load float, float addrspace(4)* %add.ptr70.26, align 4
-  %add.ptr66.27 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1051
-  %i121 = load float, float addrspace(4)* %add.ptr66.27, align 4
-  %add.ptr66.1.27 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2075
-  %i122 = load float, float addrspace(4)* %add.ptr66.1.27, align 4
-  %add.ptr66.2.27 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3099
-  %i123 = load float, float addrspace(4)* %add.ptr66.2.27, align 4
-  %add.ptr70.27 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 28
-  %i124 = load float, float addrspace(4)* %add.ptr70.27, align 4
-  %add.ptr66.28 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1052
-  %i125 = load float, float addrspace(4)* %add.ptr66.28, align 4
-  %add.ptr66.1.28 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2076
-  %i126 = load float, float addrspace(4)* %add.ptr66.1.28, align 4
-  %add.ptr66.2.28 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3100
-  %i127 = load float, float addrspace(4)* %add.ptr66.2.28, align 4
-  %add.ptr70.28 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 29
-  %i128 = load float, float addrspace(4)* %add.ptr70.28, align 4
-  %add.ptr66.29 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1053
-  %i129 = load float, float addrspace(4)* %add.ptr66.29, align 4
-  %add.ptr66.1.29 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2077
-  %i130 = load float, float addrspace(4)* %add.ptr66.1.29, align 4
-  %add.ptr66.2.29 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3101
-  %i131 = load float, float addrspace(4)* %add.ptr66.2.29, align 4
-  %add.ptr70.29 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 30
-  %i132 = load float, float addrspace(4)* %add.ptr70.29, align 4
-  %add.ptr66.30 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1054
-  %i133 = load float, float addrspace(4)* %add.ptr66.30, align 4
-  %add.ptr66.1.30 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2078
-  %i134 = load float, float addrspace(4)* %add.ptr66.1.30, align 4
-  %add.ptr66.2.30 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3102
-  %i135 = load float, float addrspace(4)* %add.ptr66.2.30, align 4
-  %add.ptr70.30 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 31
-  %i136 = load float, float addrspace(4)* %add.ptr70.30, align 4
-  %add.ptr66.31 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 1055
-  %i137 = load float, float addrspace(4)* %add.ptr66.31, align 4
-  %add.ptr66.1.31 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 2079
-  %i138 = load float, float addrspace(4)* %add.ptr66.1.31, align 4
-  %add.ptr66.2.31 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 3103
-  %i139 = load float, float addrspace(4)* %add.ptr66.2.31, align 4
-  %add.ptr47.3 = getelementptr inbounds float, float addrspace(1)* %i_ptr.0288, i64 196
+  %i8 = load float, ptr addrspace(1) %i_ptr.0288, align 4
+  %add.ptr47 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 49
+  %i9 = load float, ptr addrspace(1) %add.ptr47, align 4
+  %add.ptr47.1 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 98
+  %i10 = load float, ptr addrspace(1) %add.ptr47.1, align 4
+  %add.ptr47.2 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 147
+  %i11 = load float, ptr addrspace(1) %add.ptr47.2, align 4
+  %i12 = load float, ptr addrspace(4) %w_ptr.0287, align 4
+  %add.ptr66 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1024
+  %i13 = load float, ptr addrspace(4) %add.ptr66, align 4
+  %add.ptr66.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2048
+  %i14 = load float, ptr addrspace(4) %add.ptr66.1, align 4
+  %add.ptr66.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3072
+  %i15 = load float, ptr addrspace(4) %add.ptr66.2, align 4
+  %add.ptr70 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1
+  %i16 = load float, ptr addrspace(4) %add.ptr70, align 4
+  %add.ptr66.1291 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1025
+  %i17 = load float, ptr addrspace(4) %add.ptr66.1291, align 4
+  %add.ptr66.1.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2049
+  %i18 = load float, ptr addrspace(4) %add.ptr66.1.1, align 4
+  %add.ptr66.2.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3073
+  %i19 = load float, ptr addrspace(4) %add.ptr66.2.1, align 4
+  %add.ptr70.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2
+  %i20 = load float, ptr addrspace(4) %add.ptr70.1, align 4
+  %add.ptr66.2293 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1026
+  %i21 = load float, ptr addrspace(4) %add.ptr66.2293, align 4
+  %add.ptr66.1.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2050
+  %i22 = load float, ptr addrspace(4) %add.ptr66.1.2, align 4
+  %add.ptr66.2.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3074
+  %i23 = load float, ptr addrspace(4) %add.ptr66.2.2, align 4
+  %add.ptr70.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3
+  %i24 = load float, ptr addrspace(4) %add.ptr70.2, align 4
+  %add.ptr66.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1027
+  %i25 = load float, ptr addrspace(4) %add.ptr66.3, align 4
+  %add.ptr66.1.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2051
+  %i26 = load float, ptr addrspace(4) %add.ptr66.1.3, align 4
+  %add.ptr66.2.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3075
+  %i27 = load float, ptr addrspace(4) %add.ptr66.2.3, align 4
+  %add.ptr70.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4
+  %i28 = load float, ptr addrspace(4) %add.ptr70.3, align 4
+  %add.ptr66.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1028
+  %i29 = load float, ptr addrspace(4) %add.ptr66.4, align 4
+  %add.ptr66.1.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2052
+  %i30 = load float, ptr addrspace(4) %add.ptr66.1.4, align 4
+  %add.ptr66.2.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3076
+  %i31 = load float, ptr addrspace(4) %add.ptr66.2.4, align 4
+  %add.ptr70.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 5
+  %i32 = load float, ptr addrspace(4) %add.ptr70.4, align 4
+  %add.ptr66.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1029
+  %i33 = load float, ptr addrspace(4) %add.ptr66.5, align 4
+  %add.ptr66.1.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2053
+  %i34 = load float, ptr addrspace(4) %add.ptr66.1.5, align 4
+  %add.ptr66.2.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3077
+  %i35 = load float, ptr addrspace(4) %add.ptr66.2.5, align 4
+  %add.ptr70.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 6
+  %i36 = load float, ptr addrspace(4) %add.ptr70.5, align 4
+  %add.ptr66.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1030
+  %i37 = load float, ptr addrspace(4) %add.ptr66.6, align 4
+  %add.ptr66.1.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2054
+  %i38 = load float, ptr addrspace(4) %add.ptr66.1.6, align 4
+  %add.ptr66.2.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3078
+  %i39 = load float, ptr addrspace(4) %add.ptr66.2.6, align 4
+  %add.ptr70.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 7
+  %i40 = load float, ptr addrspace(4) %add.ptr70.6, align 4
+  %add.ptr66.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1031
+  %i41 = load float, ptr addrspace(4) %add.ptr66.7, align 4
+  %add.ptr66.1.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2055
+  %i42 = load float, ptr addrspace(4) %add.ptr66.1.7, align 4
+  %add.ptr66.2.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3079
+  %i43 = load float, ptr addrspace(4) %add.ptr66.2.7, align 4
+  %add.ptr70.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 8
+  %i44 = load float, ptr addrspace(4) %add.ptr70.7, align 4
+  %add.ptr66.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1032
+  %i45 = load float, ptr addrspace(4) %add.ptr66.8, align 4
+  %add.ptr66.1.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2056
+  %i46 = load float, ptr addrspace(4) %add.ptr66.1.8, align 4
+  %add.ptr66.2.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3080
+  %i47 = load float, ptr addrspace(4) %add.ptr66.2.8, align 4
+  %add.ptr70.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 9
+  %i48 = load float, ptr addrspace(4) %add.ptr70.8, align 4
+  %add.ptr66.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1033
+  %i49 = load float, ptr addrspace(4) %add.ptr66.9, align 4
+  %add.ptr66.1.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2057
+  %i50 = load float, ptr addrspace(4) %add.ptr66.1.9, align 4
+  %add.ptr66.2.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3081
+  %i51 = load float, ptr addrspace(4) %add.ptr66.2.9, align 4
+  %add.ptr70.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 10
+  %i52 = load float, ptr addrspace(4) %add.ptr70.9, align 4
+  %add.ptr66.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1034
+  %i53 = load float, ptr addrspace(4) %add.ptr66.10, align 4
+  %add.ptr66.1.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2058
+  %i54 = load float, ptr addrspace(4) %add.ptr66.1.10, align 4
+  %add.ptr66.2.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3082
+  %i55 = load float, ptr addrspace(4) %add.ptr66.2.10, align 4
+  %add.ptr70.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 11
+  %i56 = load float, ptr addrspace(4) %add.ptr70.10, align 4
+  %add.ptr66.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1035
+  %i57 = load float, ptr addrspace(4) %add.ptr66.11, align 4
+  %add.ptr66.1.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2059
+  %i58 = load float, ptr addrspace(4) %add.ptr66.1.11, align 4
+  %add.ptr66.2.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3083
+  %i59 = load float, ptr addrspace(4) %add.ptr66.2.11, align 4
+  %add.ptr70.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 12
+  %i60 = load float, ptr addrspace(4) %add.ptr70.11, align 4
+  %add.ptr66.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1036
+  %i61 = load float, ptr addrspace(4) %add.ptr66.12, align 4
+  %add.ptr66.1.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2060
+  %i62 = load float, ptr addrspace(4) %add.ptr66.1.12, align 4
+  %add.ptr66.2.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3084
+  %i63 = load float, ptr addrspace(4) %add.ptr66.2.12, align 4
+  %add.ptr70.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 13
+  %i64 = load float, ptr addrspace(4) %add.ptr70.12, align 4
+  %add.ptr66.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1037
+  %i65 = load float, ptr addrspace(4) %add.ptr66.13, align 4
+  %add.ptr66.1.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2061
+  %i66 = load float, ptr addrspace(4) %add.ptr66.1.13, align 4
+  %add.ptr66.2.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3085
+  %i67 = load float, ptr addrspace(4) %add.ptr66.2.13, align 4
+  %add.ptr70.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 14
+  %i68 = load float, ptr addrspace(4) %add.ptr70.13, align 4
+  %add.ptr66.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1038
+  %i69 = load float, ptr addrspace(4) %add.ptr66.14, align 4
+  %add.ptr66.1.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2062
+  %i70 = load float, ptr addrspace(4) %add.ptr66.1.14, align 4
+  %add.ptr66.2.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3086
+  %i71 = load float, ptr addrspace(4) %add.ptr66.2.14, align 4
+  %add.ptr70.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 15
+  %i72 = load float, ptr addrspace(4) %add.ptr70.14, align 4
+  %add.ptr66.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1039
+  %i73 = load float, ptr addrspace(4) %add.ptr66.15, align 4
+  %add.ptr66.1.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2063
+  %i74 = load float, ptr addrspace(4) %add.ptr66.1.15, align 4
+  %add.ptr66.2.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3087
+  %i75 = load float, ptr addrspace(4) %add.ptr66.2.15, align 4
+  %add.ptr70.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 16
+  %i76 = load float, ptr addrspace(4) %add.ptr70.15, align 4
+  %add.ptr66.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1040
+  %i77 = load float, ptr addrspace(4) %add.ptr66.16, align 4
+  %add.ptr66.1.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2064
+  %i78 = load float, ptr addrspace(4) %add.ptr66.1.16, align 4
+  %add.ptr66.2.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3088
+  %i79 = load float, ptr addrspace(4) %add.ptr66.2.16, align 4
+  %add.ptr70.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 17
+  %i80 = load float, ptr addrspace(4) %add.ptr70.16, align 4
+  %add.ptr66.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1041
+  %i81 = load float, ptr addrspace(4) %add.ptr66.17, align 4
+  %add.ptr66.1.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2065
+  %i82 = load float, ptr addrspace(4) %add.ptr66.1.17, align 4
+  %add.ptr66.2.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3089
+  %i83 = load float, ptr addrspace(4) %add.ptr66.2.17, align 4
+  %add.ptr70.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 18
+  %i84 = load float, ptr addrspace(4) %add.ptr70.17, align 4
+  %add.ptr66.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1042
+  %i85 = load float, ptr addrspace(4) %add.ptr66.18, align 4
+  %add.ptr66.1.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2066
+  %i86 = load float, ptr addrspace(4) %add.ptr66.1.18, align 4
+  %add.ptr66.2.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3090
+  %i87 = load float, ptr addrspace(4) %add.ptr66.2.18, align 4
+  %add.ptr70.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 19
+  %i88 = load float, ptr addrspace(4) %add.ptr70.18, align 4
+  %add.ptr66.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1043
+  %i89 = load float, ptr addrspace(4) %add.ptr66.19, align 4
+  %add.ptr66.1.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2067
+  %i90 = load float, ptr addrspace(4) %add.ptr66.1.19, align 4
+  %add.ptr66.2.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3091
+  %i91 = load float, ptr addrspace(4) %add.ptr66.2.19, align 4
+  %add.ptr70.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 20
+  %i92 = load float, ptr addrspace(4) %add.ptr70.19, align 4
+  %add.ptr66.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1044
+  %i93 = load float, ptr addrspace(4) %add.ptr66.20, align 4
+  %add.ptr66.1.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2068
+  %i94 = load float, ptr addrspace(4) %add.ptr66.1.20, align 4
+  %add.ptr66.2.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3092
+  %i95 = load float, ptr addrspace(4) %add.ptr66.2.20, align 4
+  %add.ptr70.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 21
+  %i96 = load float, ptr addrspace(4) %add.ptr70.20, align 4
+  %add.ptr66.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1045
+  %i97 = load float, ptr addrspace(4) %add.ptr66.21, align 4
+  %add.ptr66.1.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2069
+  %i98 = load float, ptr addrspace(4) %add.ptr66.1.21, align 4
+  %add.ptr66.2.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3093
+  %i99 = load float, ptr addrspace(4) %add.ptr66.2.21, align 4
+  %add.ptr70.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 22
+  %i100 = load float, ptr addrspace(4) %add.ptr70.21, align 4
+  %add.ptr66.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1046
+  %i101 = load float, ptr addrspace(4) %add.ptr66.22, align 4
+  %add.ptr66.1.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2070
+  %i102 = load float, ptr addrspace(4) %add.ptr66.1.22, align 4
+  %add.ptr66.2.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3094
+  %i103 = load float, ptr addrspace(4) %add.ptr66.2.22, align 4
+  %add.ptr70.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 23
+  %i104 = load float, ptr addrspace(4) %add.ptr70.22, align 4
+  %add.ptr66.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1047
+  %i105 = load float, ptr addrspace(4) %add.ptr66.23, align 4
+  %add.ptr66.1.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2071
+  %i106 = load float, ptr addrspace(4) %add.ptr66.1.23, align 4
+  %add.ptr66.2.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3095
+  %i107 = load float, ptr addrspace(4) %add.ptr66.2.23, align 4
+  %add.ptr70.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 24
+  %i108 = load float, ptr addrspace(4) %add.ptr70.23, align 4
+  %add.ptr66.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1048
+  %i109 = load float, ptr addrspace(4) %add.ptr66.24, align 4
+  %add.ptr66.1.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2072
+  %i110 = load float, ptr addrspace(4) %add.ptr66.1.24, align 4
+  %add.ptr66.2.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3096
+  %i111 = load float, ptr addrspace(4) %add.ptr66.2.24, align 4
+  %add.ptr70.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 25
+  %i112 = load float, ptr addrspace(4) %add.ptr70.24, align 4
+  %add.ptr66.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1049
+  %i113 = load float, ptr addrspace(4) %add.ptr66.25, align 4
+  %add.ptr66.1.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2073
+  %i114 = load float, ptr addrspace(4) %add.ptr66.1.25, align 4
+  %add.ptr66.2.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3097
+  %i115 = load float, ptr addrspace(4) %add.ptr66.2.25, align 4
+  %add.ptr70.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 26
+  %i116 = load float, ptr addrspace(4) %add.ptr70.25, align 4
+  %add.ptr66.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1050
+  %i117 = load float, ptr addrspace(4) %add.ptr66.26, align 4
+  %add.ptr66.1.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2074
+  %i118 = load float, ptr addrspace(4) %add.ptr66.1.26, align 4
+  %add.ptr66.2.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3098
+  %i119 = load float, ptr addrspace(4) %add.ptr66.2.26, align 4
+  %add.ptr70.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 27
+  %i120 = load float, ptr addrspace(4) %add.ptr70.26, align 4
+  %add.ptr66.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1051
+  %i121 = load float, ptr addrspace(4) %add.ptr66.27, align 4
+  %add.ptr66.1.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2075
+  %i122 = load float, ptr addrspace(4) %add.ptr66.1.27, align 4
+  %add.ptr66.2.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3099
+  %i123 = load float, ptr addrspace(4) %add.ptr66.2.27, align 4
+  %add.ptr70.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 28
+  %i124 = load float, ptr addrspace(4) %add.ptr70.27, align 4
+  %add.ptr66.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1052
+  %i125 = load float, ptr addrspace(4) %add.ptr66.28, align 4
+  %add.ptr66.1.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2076
+  %i126 = load float, ptr addrspace(4) %add.ptr66.1.28, align 4
+  %add.ptr66.2.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3100
+  %i127 = load float, ptr addrspace(4) %add.ptr66.2.28, align 4
+  %add.ptr70.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 29
+  %i128 = load float, ptr addrspace(4) %add.ptr70.28, align 4
+  %add.ptr66.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1053
+  %i129 = load float, ptr addrspace(4) %add.ptr66.29, align 4
+  %add.ptr66.1.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2077
+  %i130 = load float, ptr addrspace(4) %add.ptr66.1.29, align 4
+  %add.ptr66.2.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3101
+  %i131 = load float, ptr addrspace(4) %add.ptr66.2.29, align 4
+  %add.ptr70.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 30
+  %i132 = load float, ptr addrspace(4) %add.ptr70.29, align 4
+  %add.ptr66.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1054
+  %i133 = load float, ptr addrspace(4) %add.ptr66.30, align 4
+  %add.ptr66.1.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2078
+  %i134 = load float, ptr addrspace(4) %add.ptr66.1.30, align 4
+  %add.ptr66.2.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3102
+  %i135 = load float, ptr addrspace(4) %add.ptr66.2.30, align 4
+  %add.ptr70.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 31
+  %i136 = load float, ptr addrspace(4) %add.ptr70.30, align 4
+  %add.ptr66.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1055
+  %i137 = load float, ptr addrspace(4) %add.ptr66.31, align 4
+  %add.ptr66.1.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2079
+  %i138 = load float, ptr addrspace(4) %add.ptr66.1.31, align 4
+  %add.ptr66.2.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3103
+  %i139 = load float, ptr addrspace(4) %add.ptr66.2.31, align 4
+  %add.ptr47.3 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 196
   %i140 = tail call float @llvm.fmuladd.f32(float %i8, float %i12, float %accum.sroa.0.0)
   %i141 = tail call float @llvm.fmuladd.f32(float %i9, float %i13, float %i140)
   %i142 = tail call float @llvm.fmuladd.f32(float %i10, float %i14, float %i141)
@@ -466,7 +465,7 @@ for.cond28.preheader:                             ; preds = %for.cond28.preheade
   %i265 = tail call float @llvm.fmuladd.f32(float %i9, float %i137, float %i264)
   %i266 = tail call float @llvm.fmuladd.f32(float %i10, float %i138, float %i265)
   %i267 = tail call float @llvm.fmuladd.f32(float %i11, float %i139, float %i266)
-  %add.ptr74 = getelementptr inbounds float, float addrspace(4)* %w_ptr.0287, i64 4096
+  %add.ptr74 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4096
   %inc116 = add nuw nsw i32 %ci.0286, 1
   %exitcond.not = icmp eq i32 %inc116, 512
   br i1 %exitcond.not, label %for.cond.cleanup26, label %for.cond28.preheader
@@ -480,96 +479,96 @@ for.cond.cleanup26:                               ; preds = %for.cond28.preheade
   %add124 = add i32 %add122, %mul123
   %add126 = add i32 %add124, %mul119
   %idx.ext127 = zext i32 %add126 to i64
-  %add.ptr128 = getelementptr inbounds float, float addrspace(1)* %out_ptr, i64 %idx.ext127
-  store float %i143, float addrspace(1)* %add.ptr128, align 4
-  %add.ptr184 = getelementptr inbounds float, float addrspace(1)* %add.ptr128, i64 196
-  store float %i147, float addrspace(1)* %add.ptr184, align 4
-  %add.ptr167.1 = getelementptr inbounds float, float addrspace(1)* %add.ptr184, i64 14
-  store float 0.000000e+00, float addrspace(1)* %add.ptr167.1, align 4
-  %add.ptr175.1.1 = getelementptr inbounds float, float addrspace(1)* %add.ptr167.1, i64 1
-  store float 0.000000e+00, float addrspace(1)* %add.ptr175.1.1, align 4
-  %add.ptr184.1 = getelementptr inbounds float, float addrspace(1)* %add.ptr184, i64 196
-  store float %i151, float addrspace(1)* %add.ptr184.1, align 4
-  %add.ptr184.2 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.1, i64 196
-  store float %i155, float addrspace(1)* %add.ptr184.2, align 4
-  %add.ptr184.3 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.2, i64 196
-  store float %i159, float addrspace(1)* %add.ptr184.3, align 4
-  %add.ptr184.4 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.3, i64 196
-  store float %i163, float addrspace(1)* %add.ptr184.4, align 4
-  %add.ptr154.5 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.4, i64 1
-  store float 0.000000e+00, float addrspace(1)* %add.ptr154.5, align 4
-  %add.ptr184.5 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.4, i64 196
-  store float %i167, float addrspace(1)* %add.ptr184.5, align 4
-  %add.ptr154.6 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.5, i64 1
-  store float 0.000000e+00, float addrspace(1)* %add.ptr154.6, align 4
-  %add.ptr184.6 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.5, i64 196
-  store float %i171, float addrspace(1)* %add.ptr184.6, align 4
-  %add.ptr184.7 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.6, i64 196
-  store float %i175, float addrspace(1)* %add.ptr184.7, align 4
-  %add.ptr167.8 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.7, i64 14
-  store float 0.000000e+00, float addrspace(1)* %add.ptr167.8, align 4
-  %add.ptr175.1.8 = getelementptr inbounds float, float addrspace(1)* %add.ptr167.8, i64 1
-  store float 0.000000e+00, float addrspace(1)* %add.ptr175.1.8, align 4
-  %add.ptr184.8 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.7, i64 196
-  store float %i179, float addrspace(1)* %add.ptr184.8, align 4
-  %add.ptr184.9 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.8, i64 196
-  store float %i183, float addrspace(1)* %add.ptr184.9, align 4
-  %add.ptr184.10 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.9, i64 196
-  store float %i187, float addrspace(1)* %add.ptr184.10, align 4
-  %add.ptr184.11 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.10, i64 196
-  store float %i191, float addrspace(1)* %add.ptr184.11, align 4
-  %add.ptr184.12 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.11, i64 196
-  store float %i195, float addrspace(1)* %add.ptr184.12, align 4
-  %add.ptr184.13 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.12, i64 196
-  store float %i199, float addrspace(1)* %add.ptr184.13, align 4
-  %add.ptr184.14 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.13, i64 196
-  store float %i203, float addrspace(1)* %add.ptr184.14, align 4
-  %add.ptr184.15 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.14, i64 196
-  store float %i207, float addrspace(1)* %add.ptr184.15, align 4
-  %add.ptr184.16 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.15, i64 196
-  store float %i211, float addrspace(1)* %add.ptr184.16, align 4
-  %add.ptr184.17 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.16, i64 196
-  store float %i215, float addrspace(1)* %add.ptr184.17, align 4
-  %add.ptr184.18 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.17, i64 196
-  store float %i219, float addrspace(1)* %add.ptr184.18, align 4
-  %add.ptr184.19 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.18, i64 196
-  store float %i223, float addrspace(1)* %add.ptr184.19, align 4
-  %add.ptr184.20 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.19, i64 196
-  store float %i227, float addrspace(1)* %add.ptr184.20, align 4
-  %add.ptr184.21 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.20, i64 196
-  store float %i231, float addrspace(1)* %add.ptr184.21, align 4
-  %add.ptr184.22 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.21, i64 196
-  store float %i235, float addrspace(1)* %add.ptr184.22, align 4
-  %add.ptr184.23 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.22, i64 196
-  store float %i239, float addrspace(1)* %add.ptr184.23, align 4
-  %add.ptr184.24 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.23, i64 196
-  store float %i243, float addrspace(1)* %add.ptr184.24, align 4
-  %add.ptr184.25 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.24, i64 196
-  store float %i247, float addrspace(1)* %add.ptr184.25, align 4
-  %add.ptr184.26 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.25, i64 196
-  store float %i251, float addrspace(1)* %add.ptr184.26, align 4
-  %add.ptr184.27 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.26, i64 196
-  store float %i255, float addrspace(1)* %add.ptr184.27, align 4
-  %add.ptr184.28 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.27, i64 196
-  store float %i259, float addrspace(1)* %add.ptr184.28, align 4
-  %add.ptr184.29 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.28, i64 196
-  store float %i263, float addrspace(1)* %add.ptr184.29, align 4
-  %add.ptr184.30 = getelementptr inbounds float, float addrspace(1)* %add.ptr184.29, i64 196
-  store float %i267, float addrspace(1)* %add.ptr184.30, align 4
+  %add.ptr128 = getelementptr inbounds float, ptr addrspace(1) %out_ptr, i64 %idx.ext127
+  store float %i143, ptr addrspace(1) %add.ptr128, align 4
+  %add.ptr184 = getelementptr inbounds float, ptr addrspace(1) %add.ptr128, i64 196
+  store float %i147, ptr addrspace(1) %add.ptr184, align 4
+  %add.ptr167.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 14
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr167.1, align 4
+  %add.ptr175.1.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.1, i64 1
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.1, align 4
+  %add.ptr184.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 196
+  store float %i151, ptr addrspace(1) %add.ptr184.1, align 4
+  %add.ptr184.2 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.1, i64 196
+  store float %i155, ptr addrspace(1) %add.ptr184.2, align 4
+  %add.ptr184.3 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.2, i64 196
+  store float %i159, ptr addrspace(1) %add.ptr184.3, align 4
+  %add.ptr184.4 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.3, i64 196
+  store float %i163, ptr addrspace(1) %add.ptr184.4, align 4
+  %add.ptr154.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 1
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr154.5, align 4
+  %add.ptr184.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 196
+  store float %i167, ptr addrspace(1) %add.ptr184.5, align 4
+  %add.ptr154.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 1
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr154.6, align 4
+  %add.ptr184.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 196
+  store float %i171, ptr addrspace(1) %add.ptr184.6, align 4
+  %add.ptr184.7 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.6, i64 196
+  store float %i175, ptr addrspace(1) %add.ptr184.7, align 4
+  %add.ptr167.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 14
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr167.8, align 4
+  %add.ptr175.1.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.8, i64 1
+  store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.8, align 4
+  %add.ptr184.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 196
+  store float %i179, ptr addrspace(1) %add.ptr184.8, align 4
+  %add.ptr184.9 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.8, i64 196
+  store float %i183, ptr addrspace(1) %add.ptr184.9, align 4
+  %add.ptr184.10 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.9, i64 196
+  store float %i187, ptr addrspace(1) %add.ptr184.10, align 4
+  %add.ptr184.11 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.10, i64 196
+  store float %i191, ptr addrspace(1) %add.ptr184.11, align 4
+  %add.ptr184.12 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.11, i64 196
+  store float %i195, ptr addrspace(1) %add.ptr184.12, align 4
+  %add.ptr184.13 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.12, i64 196
+  store float %i199, ptr addrspace(1) %add.ptr184.13, align 4
+  %add.ptr184.14 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.13, i64 196
+  store float %i203, ptr addrspace(1) %add.ptr184.14, align 4
+  %add.ptr184.15 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.14, i64 196
+  store float %i207, ptr addrspace(1) %add.ptr184.15, align 4
+  %add.ptr184.16 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.15, i64 196
+  store float %i211, ptr addrspace(1) %add.ptr184.16, align 4
+  %add.ptr184.17 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.16, i64 196
+  store float %i215, ptr addrspace(1) %add.ptr184.17, align 4
+  %add.ptr184.18 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.17, i64 196
+  store float %i219, ptr addrspace(1) %add.ptr184.18, align 4
+  %add.ptr184.19 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.18, i64 196
+  store float %i223, ptr addrspace(1) %add.ptr184.19, align 4
+  %add.ptr184.20 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.19, i64 196
+  store float %i227, ptr addrspace(1) %add.ptr184.20, align 4
+  %add.ptr184.21 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.20, i64 196
+  store float %i231, ptr addrspace(1) %add.ptr184.21, align 4
+  %add.ptr184.22 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.21, i64 196
+  store float %i235, ptr addrspace(1) %add.ptr184.22, align 4
+  %add.ptr184.23 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.22, i64 196
+  store float %i239, ptr addrspace(1) %add.ptr184.23, align 4
+  %add.ptr184.24 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.23, i64 196
+  store float %i243, ptr addrspace(1) %add.ptr184.24, align 4
+  %add.ptr184.25 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.24, i64 196
+  store float %i247, ptr addrspace(1) %add.ptr184.25, align 4
+  %add.ptr184.26 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.25, i64 196
+  store float %i251, ptr addrspace(1) %add.ptr184.26, align 4
+  %add.ptr184.27 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.26, i64 196
+  store float %i255, ptr addrspace(1) %add.ptr184.27, align 4
+  %add.ptr184.28 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.27, i64 196
+  store float %i259, ptr addrspace(1) %add.ptr184.28, align 4
+  %add.ptr184.29 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.28, i64 196
+  store float %i263, ptr addrspace(1) %add.ptr184.29, align 4
+  %add.ptr184.30 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.29, i64 196
+  store float %i267, ptr addrspace(1) %add.ptr184.30, align 4
   ret void
 }
 
 declare float @llvm.fmuladd.f32(float, float, float) #0
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
-declare align 4 i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1
+declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1
 
 attributes #0 = { nofree nosync nounwind readnone speculatable willreturn }
 attributes #1 = { nounwind readnone speculatable willreturn }
 
 !0 = !{i32 1, i32 2, i32 1, i32 0}
 !1 = !{!"none", !"none", !"none", !"none"}
-!2 = !{!"float*", !"float*", !"float*", !"float"}
+!2 = !{!"ptr", !"ptr", !"ptr", !"float"}
 !3 = !{!"restrict const", !"restrict const", !"restrict", !""}
 !4 = !{i32 256, i32 1, i32 1}
 !5 = !{i32 0, i32 1024}

diff  --git a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll
index 5a0065ec14a0b..8b166b4c1bf3f 100644
--- a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.groupstaticsize() #1
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_inline_imm:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 4{{$}}
-define amdgpu_kernel void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_inline_imm(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 4
   br i1 %cmp0, label %endif, label %if
@@ -19,13 +19,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max:
 ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x7fff{{$}}
-define amdgpu_kernel void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_max(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 32767
   br i1 %cmp0, label %endif, label %if
@@ -35,13 +35,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max_p1:
 ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0x8000{{$}}
-define amdgpu_kernel void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_max_p1(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 32768
   br i1 %cmp0, label %endif, label %if
@@ -51,13 +51,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_ne_i32_simm16_max_p1:
 ; GCN: s_cmpk_lg_u32 s{{[0-9]+}}, 0x8000{{$}}
-define amdgpu_kernel void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i32_simm16_max_p1(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp ne i32 %cond, 32768
   br i1 %cmp0, label %endif, label %if
@@ -67,13 +67,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min:
 ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x8000{{$}}
-define amdgpu_kernel void @br_scc_eq_i32_simm16_min(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_min(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, -32768
   br i1 %cmp0, label %endif, label %if
@@ -83,13 +83,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min_m1:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0xffff7fff{{$}}
-define amdgpu_kernel void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_min_m1(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, -32769
   br i1 %cmp0, label %endif, label %if
@@ -99,13 +99,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm15_max:
 ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}}
-define amdgpu_kernel void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_uimm15_max(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 65535
   br i1 %cmp0, label %endif, label %if
@@ -115,13 +115,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max:
 ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}}
-define amdgpu_kernel void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_uimm16_max(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 65535
   br i1 %cmp0, label %endif, label %if
@@ -131,13 +131,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max_p1:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0x10000{{$}}
-define amdgpu_kernel void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 65536
   br i1 %cmp0, label %endif, label %if
@@ -147,14 +147,14 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32:
 ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x41{{$}}
-define amdgpu_kernel void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -164,13 +164,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_ne_i32:
 ; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x41{{$}}
-define amdgpu_kernel void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp ne i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -180,13 +180,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_sgt_i32:
 ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x41{{$}}
-define amdgpu_kernel void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sgt_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp sgt i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -196,13 +196,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max:
 ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x7fff{{$}}
-define amdgpu_kernel void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sgt_i32_simm16_max(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp sgt i32 %cond, 32767
   br i1 %cmp0, label %endif, label %if
@@ -212,13 +212,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max_p1:
 ; GCN: s_cmp_gt_i32 s{{[0-9]+}}, 0x8000{{$}}
-define amdgpu_kernel void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp sgt i32 %cond, 32768
   br i1 %cmp0, label %endif, label %if
@@ -228,30 +228,30 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_sge_i32:
 ; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sge_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sge i32 %cond, %size
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_slt_i32:
 ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x41{{$}}
-define amdgpu_kernel void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_slt_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp slt i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -261,64 +261,64 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_sle_i32:
 ; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sle_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sle i32 %cond, %size
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_ugt_i32:
 ; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ugt_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ugt i32 %cond, %size
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_uge_i32:
 ; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_uge_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp uge i32 %cond, %size
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_ult_i32:
 ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x41{{$}}
-define amdgpu_kernel void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp ult i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -328,13 +328,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16:
 ; GCN: s_cmp_lt_u32 s2, 0xffff8000
-define amdgpu_kernel void @br_scc_ult_i32_min_simm16(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32_min_simm16(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp ult i32 %cond, -32768
   br i1 %cmp0, label %endif, label %if
@@ -344,13 +344,13 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16_m1:
 ; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xffff7fff{{$}}
-define amdgpu_kernel void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32_min_simm16_m1(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp ult i32 %cond, -32769
   br i1 %cmp0, label %endif, label %if
@@ -360,200 +360,200 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_ule_i32:
 ; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ule_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ule i32 %cond, %size
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_br_scc_eq_i32:
 ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_eq_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp eq i32 %size, %cond
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_br_scc_ne_i32:
 ; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ne_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ne i32 %size, %cond
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_br_scc_sgt_i32:
 ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_sgt_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sgt i32 %size, %cond
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_br_scc_sge_i32:
 ; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_sge_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sge i32 %size, %cond
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_br_scc_slt_i32:
 ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_slt_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp slt i32 %size, %cond
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_br_scc_sle_i32:
 ; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_sle_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sle i32 %size, %cond
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_br_scc_ugt_i32:
 ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ugt_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ugt i32 %size, %cond
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_br_scc_uge_i32:
 ; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_uge_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp uge i32 %size, %cond
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_br_scc_ult_i32:
 ; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ult_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ult i32 %size, %cond
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}commute_br_scc_ule_i32:
 ; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ule_i32(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ule i32 %size, %cond
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16:
 ; GCN: s_cmp_lt_u32 s2, 0xfffff7ff
-define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %not.size = xor i32 %size, -1
@@ -561,11 +561,11 @@ entry:
   br i1 %cmp0, label %endif, label %if
 
 if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
@@ -573,7 +573,7 @@ endif:
 ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 4
 
 ; SI: v_cmp_eq_u64_e64
-define amdgpu_kernel void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i64_inline_imm(i64 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp eq i64 %cond, 4
   br i1 %cmp0, label %endif, label %if
@@ -583,7 +583,7 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
@@ -593,7 +593,7 @@ endif:
 ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]]
 
 ; SI: v_cmp_eq_u64_e32
-define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp eq i64 %cond, 4294968530
   br i1 %cmp0, label %endif, label %if
@@ -603,7 +603,7 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
@@ -611,7 +611,7 @@ endif:
 ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 4
 
 ; SI: v_cmp_ne_u64_e64
-define amdgpu_kernel void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i64_inline_imm(i64 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp ne i64 %cond, 4
   br i1 %cmp0, label %endif, label %if
@@ -621,7 +621,7 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
@@ -631,7 +631,7 @@ endif:
 ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]]
 
 ; SI: v_cmp_ne_u64_e32
-define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %cmp0 = icmp ne i64 %cond, 4294968530
   br i1 %cmp0, label %endif, label %if
@@ -641,7 +641,7 @@ if:
   br label %endif
 
 endif:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
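
For reference, the rewrite applied in these hunks is purely syntactic: a typed pointer such as i32 addrspace(1)* becomes the opaque ptr addrspace(1), and the value type stays on the load, store or getelementptr that uses it. A minimal sketch with a hypothetical function name (not part of this patch):

define void @store_example(ptr addrspace(1) %out) {
  store i32 1, ptr addrspace(1) %out   ; the stored value keeps its i32 type
  ret void
}

The typed-pointer spelling of the same function would have been define void @store_example(i32 addrspace(1)* %out) with store i32 1, i32 addrspace(1)* %out.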
 

diff  --git a/llvm/test/CodeGen/AMDGPU/speculative-execution-freecasts.ll b/llvm/test/CodeGen/AMDGPU/speculative-execution-freecasts.ll
index 376a746eb8253..fbeb4abc135a6 100644
--- a/llvm/test/CodeGen/AMDGPU/speculative-execution-freecasts.ll
+++ b/llvm/test/CodeGen/AMDGPU/speculative-execution-freecasts.ll
@@ -19,10 +19,10 @@ b:
 ; CHECK-LABEL: @ifThen_addrspacecast(
 ; CHECK: addrspacecast
 ; CHECK: br i1 true
-define void @ifThen_addrspacecast(i32* %y) {
+define void @ifThen_addrspacecast(ptr %y) {
   br i1 true, label %a, label %b
 a:
-  %x = addrspacecast i32* %y to i32 addrspace(1)*
+  %x = addrspacecast ptr %y to ptr addrspace(1)
   br label %b
 
 b:

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 8d851bb8a6cda..7588f5d6ed81e 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -9,9 +9,9 @@
 ; GCN-NOT: buffer_load_dword
 ; GCN:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
 ; GCN:     ScratchSize: 0
-define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
+define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 {
 bb:
-  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %in.1 = load <4 x float>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
   %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
   %cmp = icmp eq i32 %cond, 0
@@ -19,12 +19,12 @@ bb:
 
 use:
   call void asm sideeffect "", "a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5)
-  store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> addrspace(1)* %out
+  store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, ptr addrspace(1) %out
   br label %st
 
 st:
-  %gep1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 16
-  %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 32
+  %gep1 = getelementptr <4 x float>, ptr addrspace(1) %out, i64 16
+  %gep2 = getelementptr <4 x float>, ptr addrspace(1) %out, i64 32
   call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2)
   ret void
 }
@@ -56,7 +56,7 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
 ; GCN-NOT: buffer_load_dword
 ; GCN:     v_accvgpr_write_b32
 ; GCN:     ScratchSize: 0
-define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
+define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
 bb:
   %v = call i32 asm sideeffect "", "=a"()
   br label %use
@@ -65,7 +65,7 @@ use:
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 2.0>, i32 0, i32 0, i32 0)
   call void asm sideeffect "", "a"(i32 %v)
   %elt1 = extractelement <32 x float> %mai.1, i32 0
-  store float %elt1, float addrspace(1)* %arg
+  store float %elt1, ptr addrspace(1) %arg
   ret void
 }
 
@@ -103,15 +103,15 @@ use:
 ; GFX90A:  global_store_dwordx4 v[0:1], v[2:5], off
 
 ; GCN: ScratchSize: 20
-define amdgpu_kernel void @max_6regs_used_8a(<4 x float> addrspace(1)* %arg) #4 {
+define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %v0 = call float asm sideeffect "; def $0", "=v"()
   %a4 = call <4 x float> asm sideeffect "; def $0", "=a"()
-  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
-  %mai.in = load <4 x float>, <4 x float> addrspace(1)* %gep
+  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
+  %mai.in = load <4 x float>, ptr addrspace(1) %gep
   %mai.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.in, i32 0, i32 0, i32 0)
-  store <4 x float> %mai.out, <4 x float> addrspace(1)* %gep
-  store volatile <4 x float> %a4, <4 x float> addrspace(1)* undef
+  store <4 x float> %mai.out, ptr addrspace(1) %gep
+  store volatile <4 x float> %a4, ptr addrspace(1) undef
   call void asm sideeffect "; use $0", "v"(float %v0);
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll b/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
index ce9232f656aa5..fdf2ead0c96ff 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
@@ -4,15 +4,15 @@
 ; allocate scratch registers correctly. Check that this test compiles without
 ; error.
 ; TONGA-LABEL: test
-define amdgpu_kernel void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
-  %aptr = getelementptr <256 x i32>, <256 x i32> addrspace(1)* %in, i32 %tid
-  %a = load <256 x i32>, <256 x i32> addrspace(1)* %aptr
+  %aptr = getelementptr <256 x i32>, ptr addrspace(1) %in, i32 %tid
+  %a = load <256 x i32>, ptr addrspace(1) %aptr
   call void asm sideeffect "", "~{memory}" ()
-  %outptr = getelementptr <256 x i32>, <256 x i32> addrspace(1)* %in, i32 %tid
-  store <256 x i32> %a, <256 x i32> addrspace(1)* %outptr
+  %outptr = getelementptr <256 x i32>, ptr addrspace(1) %in, i32 %tid
+  store <256 x i32> %a, ptr addrspace(1) %outptr
 
 ; mark 128-bit SGPR registers as used so they are unavailable for the
 ; scratch resource descriptor

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll b/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll
index ec31ecc43c414..bae6c8b69c124 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll
@@ -13,28 +13,28 @@
 ; CHECK-NEXT: s_or_b64 exec
 ; CHECK: buffer_
 
-define amdgpu_kernel void @spill_cfg_position(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @spill_cfg_position(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tmp14 = load i32, i32 addrspace(1)* %arg, align 4
-  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
-  %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
-  %tmp18 = load i32, i32 addrspace(1)* %tmp17, align 4
-  %tmp19 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
-  %tmp20 = load i32, i32 addrspace(1)* %tmp19, align 4
-  %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4
-  %tmp22 = load i32, i32 addrspace(1)* %tmp21, align 4
-  %tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 5
-  %tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4
-  %tmp25 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 6
-  %tmp26 = load i32, i32 addrspace(1)* %tmp25, align 4
-  %tmp27 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 7
-  %tmp28 = load i32, i32 addrspace(1)* %tmp27, align 4
-  %tmp29 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8
-  %tmp30 = load i32, i32 addrspace(1)* %tmp29, align 4
-  %tmp33 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp1
-  %tmp34 = load i32, i32 addrspace(1)* %tmp33, align 4
+  %tmp14 = load i32, ptr addrspace(1) %arg, align 4
+  %tmp15 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  %tmp16 = load i32, ptr addrspace(1) %tmp15, align 4
+  %tmp17 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
+  %tmp18 = load i32, ptr addrspace(1) %tmp17, align 4
+  %tmp19 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
+  %tmp20 = load i32, ptr addrspace(1) %tmp19, align 4
+  %tmp21 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 4
+  %tmp22 = load i32, ptr addrspace(1) %tmp21, align 4
+  %tmp23 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 5
+  %tmp24 = load i32, ptr addrspace(1) %tmp23, align 4
+  %tmp25 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 6
+  %tmp26 = load i32, ptr addrspace(1) %tmp25, align 4
+  %tmp27 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 7
+  %tmp28 = load i32, ptr addrspace(1) %tmp27, align 4
+  %tmp29 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 8
+  %tmp30 = load i32, ptr addrspace(1) %tmp29, align 4
+  %tmp33 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp1
+  %tmp34 = load i32, ptr addrspace(1) %tmp33, align 4
   %tmp35 = icmp eq i32 %tmp34, 0
   br i1 %tmp35, label %bb44, label %bb36
 
@@ -69,7 +69,7 @@ bb52:                                             ; preds = %bb44, %bb36
   %tmp60 = add i32 %tmp59, %tmp28
   %tmp61 = add i32 %tmp60, %tmp57
   %tmp62 = add i32 %tmp61, %tmp53
-  store i32 %tmp62, i32 addrspace(1)* %tmp33, align 4
+  store i32 %tmp62, ptr addrspace(1) %tmp33, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
index 1a71bf00ea6a5..9dcb7d247f889 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
@@ -32,7 +32,7 @@
 ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]
 
 ; GCN: s_add_i32 s{{[0-9]+}}, m0, 1
-define amdgpu_kernel void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @spill_m0(i32 %cond, ptr addrspace(1) %out) #0 {
 entry:
   %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={m0}"() #0
   %cmp0 = icmp eq i32 %cond, 0
@@ -44,7 +44,7 @@ if:
 
 endif:
   %foo = call i32 asm sideeffect "s_add_i32 $0, $1, 1", "=s,{m0}"(i32 %m0) #0
-  store i32 %foo, i32 addrspace(1)* %out
+  store i32 %foo, ptr addrspace(1) %out
   ret void
 }
 
@@ -56,15 +56,14 @@ endif:
 ; GCN-NOT: v_readlane_b32 m0
 ; GCN-NOT: s_buffer_store_dword m0
 ; GCN-NOT: s_buffer_load_dword m0
-define amdgpu_ps void @spill_kill_m0_lds(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %m0) #0 {
+define amdgpu_ps void @spill_kill_m0_lds(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %m0) #0 {
 main_body:
   %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0)
   %cmp = fcmp ueq float 0.000000e+00, %tmp
   br i1 %cmp, label %if, label %else
 
 if:                                               ; preds = %main_body
-  %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
-  %lds_data_ = load float, float addrspace(3)* %lds_ptr
+  %lds_data_ = load float, ptr addrspace(3) @lds
   %lds_data = call float @llvm.amdgcn.wqm.f32(float %lds_data_)
   br label %endif
 
@@ -115,11 +114,11 @@ main_body:
    br i1 %cmp, label %if, label %else
 
 if:                                               ; preds = %main_body
-  store volatile i32 8, i32 addrspace(1)* undef
+  store volatile i32 8, ptr addrspace(1) undef
   br label %endif
 
 else:                                             ; preds = %main_body
-  store volatile i32 11, i32 addrspace(1)* undef
+  store volatile i32 11, ptr addrspace(1) undef
   br label %endif
 
 endif:
@@ -170,12 +169,12 @@ endif:
 ; TOSMEM: s_endpgm
 define amdgpu_kernel void @restore_m0_lds(i32 %arg) {
   %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={m0}"() #0
-  %sval = load volatile i64, i64 addrspace(4)* undef
+  %sval = load volatile i64, ptr addrspace(4) undef
   %cmp = icmp eq i32 %arg, 0
   br i1 %cmp, label %ret, label %bb
 
 bb:
-  store volatile i64 %sval, i64 addrspace(3)* undef
+  store volatile i64 %sval, ptr addrspace(3) undef
   call void asm sideeffect "; use $0", "{m0}"(i32 %m0) #0
   br label %ret
 

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
index ec53c552c525e..65292eb96c694 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -43,18 +43,17 @@ entry:
   ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
   ; the instruction offset field.
   %alloca = alloca i8, i32 4088, align 4, addrspace(5)
-  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 
-  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
+  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 
 
-  %a = load volatile i32, i32 addrspace(5)* %aptr
+  %a = load volatile i32, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
-  store volatile i32 %a, i32 addrspace(5)* %outptr
+  %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+  store volatile i32 %a, ptr addrspace(5) %outptr
 
   ret void
 }
@@ -97,16 +96,15 @@ entry:
   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
   ; fit in the instruction, and has to live in the SGPR offset.
   %alloca = alloca i8, i32 4092, align 4, addrspace(5)
-  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 
-  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
+  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
   ; 0x40000 / 64 = 4096 (for wave64)
-  %a = load volatile i32, i32 addrspace(5)* %aptr
+  %a = load volatile i32, ptr addrspace(5) %aptr
   ; Force %a to spill
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
-  store volatile i32 %a, i32 addrspace(5)* %outptr
+  %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+  store volatile i32 %a, ptr addrspace(5) %outptr
 
   ret void
 }
@@ -159,9 +157,8 @@ entry:
   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
   ; fit in the instruction, and has to live in the SGPR offset.
   %alloca = alloca i8, i32 4096, align 4, addrspace(5)
-  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 
-  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
+  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 
   %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
   %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
@@ -174,7 +171,7 @@ entry:
   %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
 
   ; 0x40000 / 64 = 4096 (for wave64)
-  %a = load volatile i32, i32 addrspace(5)* %aptr
+  %a = load volatile i32, ptr addrspace(5) %aptr
   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
 
   %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
@@ -242,9 +239,8 @@ entry:
   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
   ; fit in the instruction, and has to live in the SGPR offset.
   %alloca = alloca i8, i32 4096, align 4, addrspace(5)
-  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 
-  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
+  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 
   %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
   %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
@@ -257,7 +253,7 @@ entry:
   %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
 
   ; 0x40000 / 64 = 4096 (for wave64)
-  %a = load volatile i32, i32 addrspace(5)* %aptr
+  %a = load volatile i32, ptr addrspace(5) %aptr
   call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
 
   %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
@@ -326,17 +322,15 @@ entry:
   ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
   ; the instruction offset field.
   %alloca = alloca i8, i32 4084, align 4, addrspace(5)
-  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
-  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
-  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
-  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
+  %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
+  %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
   ; Ensure the alloca sticks around.
-  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
-  %b = load volatile i32, i32 addrspace(5)* %bptr
+  %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+  %b = load volatile i32, ptr addrspace(5) %bptr
 
   ; Ensure the spill is of the full super-reg.
   call void asm sideeffect "; $0", "r"(<2 x i32> %a)
@@ -395,19 +389,17 @@ entry:
   ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
   ; in the SGPR offset.
   %alloca = alloca i8, i32 4088, align 4, addrspace(5)
-  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
-  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
 
   ; 0x3ff00 / 64 = 4092 (for wave64)
-  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
-  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
+  %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
+  %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
   ; Ensure the alloca sticks around.
-  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
-  %b = load volatile i32, i32 addrspace(5)* %bptr
+  %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+  %b = load volatile i32, ptr addrspace(5) %bptr
 
   ; Ensure the spill is of the full super-reg.
   call void asm sideeffect "; $0", "r"(<2 x i32> %a)
@@ -449,18 +441,17 @@ entry:
   ; slot is added. It's hard to hit the actual limit since we're also
   ; going to insert the emergency stack slot for large frames.
   %alloca = alloca i8, i32 4088, align 4, addrspace(5)
-  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 
-  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
+  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
 
 
-  %a = load volatile i32, i32 addrspace(5)* %aptr
+  %a = load volatile i32, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
-  store volatile i32 %a, i32 addrspace(5)* %outptr
+  %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+  store volatile i32 %a, ptr addrspace(5) %outptr
 
   ret void
 }
@@ -501,17 +492,16 @@ entry:
   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
   ; fit in the instruction, and has to live in the SGPR offset.
   %alloca = alloca i8, i32 4096, align 4, addrspace(5)
-  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 
-  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
+  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
   ; 0x40000 / 64 = 4096 (for wave64)
-  %a = load volatile i32, i32 addrspace(5)* %aptr
+  %a = load volatile i32, ptr addrspace(5) %aptr
 
   ; Force %a to spill
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
-  store volatile i32 %a, i32 addrspace(5)* %outptr
+  %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+  store volatile i32 %a, ptr addrspace(5) %outptr
 
   ret void
 }
@@ -565,17 +555,15 @@ entry:
   ; still fits below offset 4096 (4084 + 8 - 4 = 4092), and can be placed in
   ; the instruction offset field.
   %alloca = alloca i8, i32 4084, align 4, addrspace(5)
-  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
-  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
-  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
-  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
+  %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
+  %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
   ; Ensure the alloca sticks around.
-  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
-  %b = load volatile i32, i32 addrspace(5)* %bptr
+  %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+  %b = load volatile i32, ptr addrspace(5) %bptr
 
   ; Ensure the spill is of the full super-reg.
   call void asm sideeffect "; $0", "r"(<2 x i32> %a)
@@ -630,19 +618,17 @@ entry:
   ; does not fit below offset 4096 (4088 + 4 + 8 - 4 = 4096), and has to live
   ; in the SGPR offset.
   %alloca = alloca i8, i32 4088, align 4, addrspace(5)
-  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
-  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
 
   ; 0x3ff00 / 64 = 4092 (for wave64)
-  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
-  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
+  %aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
+  %a = load volatile <2 x i32>, ptr addrspace(5) %aptr
 
   ; Force %a to spill.
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
   ; Ensure the alloca sticks around.
-  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
-  %b = load volatile i32, i32 addrspace(5)* %bptr
+  %bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+  %b = load volatile i32, ptr addrspace(5) %bptr
 
   ; Ensure the spill is of the full super-reg.
   call void asm sideeffect "; $0", "r"(<2 x i32> %a)

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index b7428f6562166..2aba719a03a50 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -13,7 +13,7 @@
 ; mechanism works even when many spills happen.
 
 ; Just test that it compiles successfully.
-define amdgpu_kernel void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX6-LABEL: test:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b32 s44, SCRATCH_RSRC_DWORD0
@@ -10055,8 +10055,8 @@ entry:
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
 
-  %aptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid
-  %a = load <1280 x i32>, <1280 x i32> addrspace(1)* %aptr
+  %aptr = getelementptr <1280 x i32>, ptr addrspace(1) %in, i32 %tid
+  %a = load <1280 x i32>, ptr addrspace(1) %aptr
 
 ; mark most VGPR registers as used to increase register pressure
   call void asm sideeffect "", "~{v4},~{v8},~{v12},~{v16},~{v20},~{v24},~{v28},~{v32}" ()
@@ -10067,13 +10067,13 @@ entry:
   call void asm sideeffect "", "~{v164},~{v168},~{v172},~{v176},~{v180},~{v184},~{v188},~{v192}" ()
   call void asm sideeffect "", "~{v196},~{v200},~{v204},~{v208},~{v212},~{v216},~{v220},~{v224}" ()
 
-  %outptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %out, i32 %tid
-  store <1280 x i32> %a, <1280 x i32> addrspace(1)* %outptr
+  %outptr = getelementptr <1280 x i32>, ptr addrspace(1) %out, i32 %tid
+  store <1280 x i32> %a, ptr addrspace(1) %outptr
 
   ret void
 }
 
-define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX6-LABEL: test_limited_sgpr:
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -11064,13 +11064,13 @@ entry:
   %scratch = alloca <1280 x i32>, align 8, addrspace(5)
 
 ; load VGPR data
-  %aptr = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %in, i32 %tid
-  %a = load <64 x i32>, <64 x i32> addrspace(1)* %aptr
+  %aptr = getelementptr <64 x i32>, ptr addrspace(1) %in, i32 %tid
+  %a = load <64 x i32>, ptr addrspace(1) %aptr
 
 ; make sure scratch is used
   %x = extractelement <64 x i32> %a, i32 0
-  %sptr0 = getelementptr <1280 x i32>, <1280 x i32> addrspace(5)* %scratch, i32 %x, i32 0
-  store i32 1, i32 addrspace(5)* %sptr0
+  %sptr0 = getelementptr <1280 x i32>, ptr addrspace(5) %scratch, i32 %x, i32 0
+  store i32 1, ptr addrspace(5) %sptr0
 
 ; fill up SGPRs
   %sgpr0 = call <8 x i32> asm sideeffect "; def $0", "=s" ()
@@ -11099,8 +11099,8 @@ bb0:
   br label %ret
 
 ret:
-  %outptr = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %out, i32 %tid
-  store <64 x i32> %a, <64 x i32> addrspace(1)* %outptr
+  %outptr = getelementptr <64 x i32>, ptr addrspace(1) %out, i32 %tid
+  store <64 x i32> %a, ptr addrspace(1) %outptr
 
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
index 521cd91ef7a0b..941240da8ba94 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
@@ -14,40 +14,40 @@
 ; GFX908: ScratchSize: 0
 ; GFX908: VGPRBlocks: 2
 ; GFX908: NumVGPRsForWavesPerEU: 10
-define amdgpu_kernel void @max_11_vgprs_used_9a(i32 addrspace(1)* %p) #0 {
-  %tid = load volatile i32, i32 addrspace(1)* undef
+define amdgpu_kernel void @max_11_vgprs_used_9a(ptr addrspace(1) %p) #0 {
+  %tid = load volatile i32, ptr addrspace(1) undef
   call void asm sideeffect "", "a,a,a,a,a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
-  %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
-  %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4
-  %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8
-  %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12
-  %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16
-  %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20
-  %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24
-  %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28
-  %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32
-  %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36
-  %v1 = load volatile i32, i32 addrspace(1)* %p1
-  %v2 = load volatile i32, i32 addrspace(1)* %p2
-  %v3 = load volatile i32, i32 addrspace(1)* %p3
-  %v4 = load volatile i32, i32 addrspace(1)* %p4
-  %v5 = load volatile i32, i32 addrspace(1)* %p5
-  %v6 = load volatile i32, i32 addrspace(1)* %p6
-  %v7 = load volatile i32, i32 addrspace(1)* %p7
-  %v8 = load volatile i32, i32 addrspace(1)* %p8
-  %v9 = load volatile i32, i32 addrspace(1)* %p9
-  %v10 = load volatile i32, i32 addrspace(1)* %p10
+  %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid
+  %p2 = getelementptr inbounds i32, ptr addrspace(1) %p1, i32 4
+  %p3 = getelementptr inbounds i32, ptr addrspace(1) %p2, i32 8
+  %p4 = getelementptr inbounds i32, ptr addrspace(1) %p3, i32 12
+  %p5 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 16
+  %p6 = getelementptr inbounds i32, ptr addrspace(1) %p5, i32 20
+  %p7 = getelementptr inbounds i32, ptr addrspace(1) %p6, i32 24
+  %p8 = getelementptr inbounds i32, ptr addrspace(1) %p7, i32 28
+  %p9 = getelementptr inbounds i32, ptr addrspace(1) %p8, i32 32
+  %p10 = getelementptr inbounds i32, ptr addrspace(1) %p9, i32 36
+  %v1 = load volatile i32, ptr addrspace(1) %p1
+  %v2 = load volatile i32, ptr addrspace(1) %p2
+  %v3 = load volatile i32, ptr addrspace(1) %p3
+  %v4 = load volatile i32, ptr addrspace(1) %p4
+  %v5 = load volatile i32, ptr addrspace(1) %p5
+  %v6 = load volatile i32, ptr addrspace(1) %p6
+  %v7 = load volatile i32, ptr addrspace(1) %p7
+  %v8 = load volatile i32, ptr addrspace(1) %p8
+  %v9 = load volatile i32, ptr addrspace(1) %p9
+  %v10 = load volatile i32, ptr addrspace(1) %p10
   call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10)
-  store volatile i32 %v1, i32 addrspace(1)* undef
-  store volatile i32 %v2, i32 addrspace(1)* undef
-  store volatile i32 %v3, i32 addrspace(1)* undef
-  store volatile i32 %v4, i32 addrspace(1)* undef
-  store volatile i32 %v5, i32 addrspace(1)* undef
-  store volatile i32 %v6, i32 addrspace(1)* undef
-  store volatile i32 %v7, i32 addrspace(1)* undef
-  store volatile i32 %v8, i32 addrspace(1)* undef
-  store volatile i32 %v9, i32 addrspace(1)* undef
-  store volatile i32 %v10, i32 addrspace(1)* undef
+  store volatile i32 %v1, ptr addrspace(1) undef
+  store volatile i32 %v2, ptr addrspace(1) undef
+  store volatile i32 %v3, ptr addrspace(1) undef
+  store volatile i32 %v4, ptr addrspace(1) undef
+  store volatile i32 %v5, ptr addrspace(1) undef
+  store volatile i32 %v6, ptr addrspace(1) undef
+  store volatile i32 %v7, ptr addrspace(1) undef
+  store volatile i32 %v8, ptr addrspace(1) undef
+  store volatile i32 %v9, ptr addrspace(1) undef
+  store volatile i32 %v10, ptr addrspace(1) undef
   ret void
 }
 
@@ -83,25 +83,25 @@ define amdgpu_kernel void @max_11_vgprs_used_9a(i32 addrspace(1)* %p) #0 {
 ; GFX908: ScratchSize: 12
 ; GFX908: VGPRBlocks: 2
 ; GFX908: NumVGPRsForWavesPerEU: 11
-define amdgpu_kernel void @max_11_vgprs_used_1a_partial_spill(i64 addrspace(1)* %p) #0 {
-  %tid = load volatile i32, i32 addrspace(1)* undef
+define amdgpu_kernel void @max_11_vgprs_used_1a_partial_spill(ptr addrspace(1) %p) #0 {
+  %tid = load volatile i32, ptr addrspace(1) undef
   call void asm sideeffect "", "a"(i32 1)
-  %p1 = getelementptr inbounds i64, i64 addrspace(1)* %p, i32 %tid
-  %p2 = getelementptr inbounds i64, i64 addrspace(1)* %p1, i32 8
-  %p3 = getelementptr inbounds i64, i64 addrspace(1)* %p2, i32 16
-  %p4 = getelementptr inbounds i64, i64 addrspace(1)* %p3, i32 24
-  %p5 = getelementptr inbounds i64, i64 addrspace(1)* %p4, i32 32
-  %v1 = load volatile i64, i64 addrspace(1)* %p1
-  %v2 = load volatile i64, i64 addrspace(1)* %p2
-  %v3 = load volatile i64, i64 addrspace(1)* %p3
-  %v4 = load volatile i64, i64 addrspace(1)* %p4
-  %v5 = load volatile i64, i64 addrspace(1)* %p5
+  %p1 = getelementptr inbounds i64, ptr addrspace(1) %p, i32 %tid
+  %p2 = getelementptr inbounds i64, ptr addrspace(1) %p1, i32 8
+  %p3 = getelementptr inbounds i64, ptr addrspace(1) %p2, i32 16
+  %p4 = getelementptr inbounds i64, ptr addrspace(1) %p3, i32 24
+  %p5 = getelementptr inbounds i64, ptr addrspace(1) %p4, i32 32
+  %v1 = load volatile i64, ptr addrspace(1) %p1
+  %v2 = load volatile i64, ptr addrspace(1) %p2
+  %v3 = load volatile i64, ptr addrspace(1) %p3
+  %v4 = load volatile i64, ptr addrspace(1) %p4
+  %v5 = load volatile i64, ptr addrspace(1) %p5
   call void asm sideeffect "", "v,v,v,v,v"(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5)
-  store volatile i64 %v1, i64 addrspace(1)* %p2
-  store volatile i64 %v2, i64 addrspace(1)* %p3
-  store volatile i64 %v3, i64 addrspace(1)* %p4
-  store volatile i64 %v4, i64 addrspace(1)* %p5
-  store volatile i64 %v5, i64 addrspace(1)* %p1
+  store volatile i64 %v1, ptr addrspace(1) %p2
+  store volatile i64 %v2, ptr addrspace(1) %p3
+  store volatile i64 %v3, ptr addrspace(1) %p4
+  store volatile i64 %v4, ptr addrspace(1) %p5
+  store volatile i64 %v5, ptr addrspace(1) %p1
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
index e892f4176fc6c..c056d35c56beb 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
@@ -17,39 +17,39 @@
 ; GCN:    VGPRBlocks: 2
 ; GFX900: NumVGPRsForWavesPerEU: 11
 ; GFX908: NumVGPRsForWavesPerEU: 10
-define amdgpu_kernel void @max_11_vgprs(i32 addrspace(1)* %p) #2 {
-  %tid = load volatile i32, i32 addrspace(1)* undef
-  %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
-  %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4
-  %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8
-  %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12
-  %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16
-  %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20
-  %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24
-  %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28
-  %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32
-  %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36
-  %v1 = load volatile i32, i32 addrspace(1)* %p1
-  %v2 = load volatile i32, i32 addrspace(1)* %p2
-  %v3 = load volatile i32, i32 addrspace(1)* %p3
-  %v4 = load volatile i32, i32 addrspace(1)* %p4
-  %v5 = load volatile i32, i32 addrspace(1)* %p5
-  %v6 = load volatile i32, i32 addrspace(1)* %p6
-  %v7 = load volatile i32, i32 addrspace(1)* %p7
-  %v8 = load volatile i32, i32 addrspace(1)* %p8
-  %v9 = load volatile i32, i32 addrspace(1)* %p9
-  %v10 = load volatile i32, i32 addrspace(1)* %p10
+define amdgpu_kernel void @max_11_vgprs(ptr addrspace(1) %p) #2 {
+  %tid = load volatile i32, ptr addrspace(1) undef
+  %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid
+  %p2 = getelementptr inbounds i32, ptr addrspace(1) %p1, i32 4
+  %p3 = getelementptr inbounds i32, ptr addrspace(1) %p2, i32 8
+  %p4 = getelementptr inbounds i32, ptr addrspace(1) %p3, i32 12
+  %p5 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 16
+  %p6 = getelementptr inbounds i32, ptr addrspace(1) %p5, i32 20
+  %p7 = getelementptr inbounds i32, ptr addrspace(1) %p6, i32 24
+  %p8 = getelementptr inbounds i32, ptr addrspace(1) %p7, i32 28
+  %p9 = getelementptr inbounds i32, ptr addrspace(1) %p8, i32 32
+  %p10 = getelementptr inbounds i32, ptr addrspace(1) %p9, i32 36
+  %v1 = load volatile i32, ptr addrspace(1) %p1
+  %v2 = load volatile i32, ptr addrspace(1) %p2
+  %v3 = load volatile i32, ptr addrspace(1) %p3
+  %v4 = load volatile i32, ptr addrspace(1) %p4
+  %v5 = load volatile i32, ptr addrspace(1) %p5
+  %v6 = load volatile i32, ptr addrspace(1) %p6
+  %v7 = load volatile i32, ptr addrspace(1) %p7
+  %v8 = load volatile i32, ptr addrspace(1) %p8
+  %v9 = load volatile i32, ptr addrspace(1) %p9
+  %v10 = load volatile i32, ptr addrspace(1) %p10
   call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10)
-  store volatile i32 %v1, i32 addrspace(1)* undef
-  store volatile i32 %v2, i32 addrspace(1)* undef
-  store volatile i32 %v3, i32 addrspace(1)* undef
-  store volatile i32 %v4, i32 addrspace(1)* undef
-  store volatile i32 %v5, i32 addrspace(1)* undef
-  store volatile i32 %v6, i32 addrspace(1)* undef
-  store volatile i32 %v7, i32 addrspace(1)* undef
-  store volatile i32 %v8, i32 addrspace(1)* undef
-  store volatile i32 %v9, i32 addrspace(1)* undef
-  store volatile i32 %v10, i32 addrspace(1)* undef
+  store volatile i32 %v1, ptr addrspace(1) undef
+  store volatile i32 %v2, ptr addrspace(1) undef
+  store volatile i32 %v3, ptr addrspace(1) undef
+  store volatile i32 %v4, ptr addrspace(1) undef
+  store volatile i32 %v5, ptr addrspace(1) undef
+  store volatile i32 %v6, ptr addrspace(1) undef
+  store volatile i32 %v7, ptr addrspace(1) undef
+  store volatile i32 %v8, ptr addrspace(1) undef
+  store volatile i32 %v9, ptr addrspace(1) undef
+  store volatile i32 %v10, ptr addrspace(1) undef
   ret void
 }
 
@@ -66,11 +66,11 @@ define amdgpu_kernel void @max_11_vgprs(i32 addrspace(1)* %p) #2 {
 ; GFX908: ScratchSize: 68
 ; GFX908: VGPRBlocks: 2
 ; GFX908: NumVGPRsForWavesPerEU: 10
-define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p) #0 {
+define amdgpu_kernel void @max_10_vgprs_spill_v32(ptr addrspace(1) %p) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
-  %v = load volatile <32 x float>, <32 x float> addrspace(1)* %gep
-  store volatile <32 x float> %v, <32 x float> addrspace(1)* undef
+  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %p, i32 %tid
+  %v = load volatile <32 x float>, ptr addrspace(1) %gep
+  store volatile <32 x float> %v, ptr addrspace(1) undef
   ret void
 }
 
@@ -92,35 +92,35 @@ define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p)
 ; GCN908:    VGPRBlocks: 62
 ; GFX900:    NumVGPRsForWavesPerEU: 256
 ; GFX908:    NumVGPRsForWavesPerEU: 252
-define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 {
+define amdgpu_kernel void @max_256_vgprs_spill_9x32(ptr addrspace(1) %p) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
-  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
-  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
-  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
-  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
-  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
-  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
-  %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
-  %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
-  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
-  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
-  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
-  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
-  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
-  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
-  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
-  %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
-  %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
-  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
+  %p1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p, i32 %tid
+  %p2 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p1, i32 %tid
+  %p3 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p2, i32 %tid
+  %p4 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p3, i32 %tid
+  %p5 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p4, i32 %tid
+  %p6 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p5, i32 %tid
+  %p7 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p6, i32 %tid
+  %p8 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p7, i32 %tid
+  %p9 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p8, i32 %tid
+  %v1 = load volatile <32 x float>, ptr addrspace(1) %p1
+  %v2 = load volatile <32 x float>, ptr addrspace(1) %p2
+  %v3 = load volatile <32 x float>, ptr addrspace(1) %p3
+  %v4 = load volatile <32 x float>, ptr addrspace(1) %p4
+  %v5 = load volatile <32 x float>, ptr addrspace(1) %p5
+  %v6 = load volatile <32 x float>, ptr addrspace(1) %p6
+  %v7 = load volatile <32 x float>, ptr addrspace(1) %p7
+  %v8 = load volatile <32 x float>, ptr addrspace(1) %p8
+  %v9 = load volatile <32 x float>, ptr addrspace(1) %p9
+  store volatile <32 x float> %v1, ptr addrspace(1) undef
+  store volatile <32 x float> %v2, ptr addrspace(1) undef
+  store volatile <32 x float> %v3, ptr addrspace(1) undef
+  store volatile <32 x float> %v4, ptr addrspace(1) undef
+  store volatile <32 x float> %v5, ptr addrspace(1) undef
+  store volatile <32 x float> %v6, ptr addrspace(1) undef
+  store volatile <32 x float> %v7, ptr addrspace(1) undef
+  store volatile <32 x float> %v8, ptr addrspace(1) undef
+  store volatile <32 x float> %v9, ptr addrspace(1) undef
   ret void
 }
 
@@ -143,38 +143,38 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %
 ; GFX908: VGPRBlocks: 62
 ; GFX900: NumVGPRsForWavesPerEU: 256
 ; GFX908: NumVGPRsForWavesPerEU: 252
-define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {
+define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(ptr addrspace(1) %p) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
-  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
-  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
-  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
-  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
-  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
-  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
-  %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
-  %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
-  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
-  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
-  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
-  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
-  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
-  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
-  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
-  %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
-  %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
+  %p1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p, i32 %tid
+  %p2 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p1, i32 %tid
+  %p3 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p2, i32 %tid
+  %p4 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p3, i32 %tid
+  %p5 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p4, i32 %tid
+  %p6 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p5, i32 %tid
+  %p7 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p6, i32 %tid
+  %p8 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p7, i32 %tid
+  %p9 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p8, i32 %tid
+  %v1 = load volatile <32 x float>, ptr addrspace(1) %p1
+  %v2 = load volatile <32 x float>, ptr addrspace(1) %p2
+  %v3 = load volatile <32 x float>, ptr addrspace(1) %p3
+  %v4 = load volatile <32 x float>, ptr addrspace(1) %p4
+  %v5 = load volatile <32 x float>, ptr addrspace(1) %p5
+  %v6 = load volatile <32 x float>, ptr addrspace(1) %p6
+  %v7 = load volatile <32 x float>, ptr addrspace(1) %p7
+  %v8 = load volatile <32 x float>, ptr addrspace(1) %p8
+  %v9 = load volatile <32 x float>, ptr addrspace(1) %p9
   br label %st
 
 st:
-  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v1, ptr addrspace(1) undef
+  store volatile <32 x float> %v2, ptr addrspace(1) undef
+  store volatile <32 x float> %v3, ptr addrspace(1) undef
+  store volatile <32 x float> %v4, ptr addrspace(1) undef
+  store volatile <32 x float> %v5, ptr addrspace(1) undef
+  store volatile <32 x float> %v6, ptr addrspace(1) undef
+  store volatile <32 x float> %v7, ptr addrspace(1) undef
+  store volatile <32 x float> %v8, ptr addrspace(1) undef
+  store volatile <32 x float> %v9, ptr addrspace(1) undef
   ret void
 }
 
@@ -185,34 +185,34 @@ st:
 ; GFX908: v_accvgpr_write_b32
 ; GFX908: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32
 ; GFX908: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
-define void @stack_args_vgpr_spill(<32 x float> %arg0, <32 x float> %arg1, <32 x float> addrspace(1)* %p) #1 {
+define void @stack_args_vgpr_spill(<32 x float> %arg0, <32 x float> %arg1, ptr addrspace(1) %p) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
-  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
-  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
-  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
-  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
-  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
-  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
-  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
-  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
-  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
-  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
-  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
-  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
-  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
+  %p1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p, i32 %tid
+  %p2 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p1, i32 %tid
+  %p3 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p2, i32 %tid
+  %p4 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p3, i32 %tid
+  %p5 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p4, i32 %tid
+  %p6 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p5, i32 %tid
+  %p7 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p6, i32 %tid
+  %v1 = load volatile <32 x float>, ptr addrspace(1) %p1
+  %v2 = load volatile <32 x float>, ptr addrspace(1) %p2
+  %v3 = load volatile <32 x float>, ptr addrspace(1) %p3
+  %v4 = load volatile <32 x float>, ptr addrspace(1) %p4
+  %v5 = load volatile <32 x float>, ptr addrspace(1) %p5
+  %v6 = load volatile <32 x float>, ptr addrspace(1) %p6
+  %v7 = load volatile <32 x float>, ptr addrspace(1) %p7
   br label %st
 
 st:
-  store volatile <32 x float> %arg0, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %arg1, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
-  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %arg0, ptr addrspace(1) undef
+  store volatile <32 x float> %arg1, ptr addrspace(1) undef
+  store volatile <32 x float> %v1, ptr addrspace(1) undef
+  store volatile <32 x float> %v2, ptr addrspace(1) undef
+  store volatile <32 x float> %v3, ptr addrspace(1) undef
+  store volatile <32 x float> %v4, ptr addrspace(1) undef
+  store volatile <32 x float> %v5, ptr addrspace(1) undef
+  store volatile <32 x float> %v6, ptr addrspace(1) undef
+  store volatile <32 x float> %v7, ptr addrspace(1) undef
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
index ecde59e82caa0..906e2f88a21dd 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
@@ -15,7 +15,7 @@
 ; VMEM: s_cbranch_scc1
 
 ; VMEM: buffer_load_dword
-define amdgpu_kernel void @spill_sgpr_x2(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @spill_sgpr_x2(ptr addrspace(1) %out, i32 %in) #0 {
   %wide.sgpr = call <2 x i32>  asm sideeffect "; def $0", "=s" () #0
   %cmp = icmp eq i32 %in, 0
   br i1 %cmp, label %bb0, label %ret
@@ -44,7 +44,7 @@ ret:
 ; VMEM: s_cbranch_scc1
 
 ; VMEM: buffer_load_dword
-define amdgpu_kernel void @spill_sgpr_x3(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @spill_sgpr_x3(ptr addrspace(1) %out, i32 %in) #0 {
   %wide.sgpr = call <3 x i32>  asm sideeffect "; def $0", "=s" () #0
   %cmp = icmp eq i32 %in, 0
   br i1 %cmp, label %bb0, label %ret
@@ -75,7 +75,7 @@ ret:
 ; VMEM: s_cbranch_scc1
 
 ; VMEM: buffer_load_dword
-define amdgpu_kernel void @spill_sgpr_x4(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @spill_sgpr_x4(ptr addrspace(1) %out, i32 %in) #0 {
   %wide.sgpr = call <4 x i32>  asm sideeffect "; def $0", "=s" () #0
   %cmp = icmp eq i32 %in, 0
   br i1 %cmp, label %bb0, label %ret
@@ -108,7 +108,7 @@ ret:
 ; VMEM: s_cbranch_scc1
 
 ; VMEM: buffer_load_dword
-define amdgpu_kernel void @spill_sgpr_x5(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @spill_sgpr_x5(ptr addrspace(1) %out, i32 %in) #0 {
   %wide.sgpr = call <5 x i32>  asm sideeffect "; def $0", "=s" () #0
   %cmp = icmp eq i32 %in, 0
   br i1 %cmp, label %bb0, label %ret
@@ -146,7 +146,7 @@ ret:
 ; VMEM: s_cbranch_scc1
 
 ; VMEM: buffer_load_dword
-define amdgpu_kernel void @spill_sgpr_x8(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @spill_sgpr_x8(ptr addrspace(1) %out, i32 %in) #0 {
   %wide.sgpr = call <8 x i32>  asm sideeffect "; def $0", "=s" () #0
   %cmp = icmp eq i32 %in, 0
   br i1 %cmp, label %bb0, label %ret
@@ -200,7 +200,7 @@ ret:
 ; VMEM: s_cbranch_scc1
 
 ; VMEM: buffer_load_dword
-define amdgpu_kernel void @spill_sgpr_x16(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @spill_sgpr_x16(ptr addrspace(1) %out, i32 %in) #0 {
   %wide.sgpr = call <16 x i32>  asm sideeffect "; def $0", "=s" () #0
   %cmp = icmp eq i32 %in, 0
   br i1 %cmp, label %bb0, label %ret
@@ -286,7 +286,7 @@ ret:
 ; VMEM: s_cbranch_scc1
 
 ; VMEM: buffer_load_dword
-define amdgpu_kernel void @spill_sgpr_x32(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @spill_sgpr_x32(ptr addrspace(1) %out, i32 %in) #0 {
   %wide.sgpr = call <32 x i32>  asm sideeffect "; def $0", "=s" () #0
   %cmp = icmp eq i32 %in, 0
   br i1 %cmp, label %bb0, label %ret

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
index ab7590ad67e6c..e8a46bd72aec2 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
@@ -29,7 +29,7 @@ define void @sgpr_spill_writelane() {
 
 ; FIXME: The writelane intrinsic doesn't really overwrite any inactive lanes
 ; and hence there is no need to preserve the VGPR it modifies.
-define void @device_writelane_intrinsic(i32 addrspace(1)* %out, i32 %src) {
+define void @device_writelane_intrinsic(ptr addrspace(1) %out, i32 %src) {
 ; GCN-LABEL: device_writelane_intrinsic:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -46,11 +46,11 @@ define void @device_writelane_intrinsic(i32 addrspace(1)* %out, i32 %src) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %writelane = call i32 @llvm.amdgcn.writelane(i32 %src, i32 23, i32 15)
-  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @kernel_writelane_intrinsic(i32 addrspace(1)* %out, i32 %src0, i32 %src1) {
+define amdgpu_kernel void @kernel_writelane_intrinsic(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GCN-LABEL: kernel_writelane_intrinsic:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -62,6 +62,6 @@ define amdgpu_kernel void @kernel_writelane_intrinsic(i32 addrspace(1)* %out, i3
 ; GCN-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
   %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 45)
-  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  store i32 %writelane, ptr addrspace(1) %out, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
index d85774e215369..00564a7db77bc 100644
--- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
@@ -29,7 +29,7 @@ define void @spill_more_than_wavesize_csr_sgprs() {
 ; CHECK-DAG:    v_readlane_b32 s98, v0, 63
 define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
+  store volatile i32 0, ptr addrspace(5) %alloca
   call void asm sideeffect "",
    "~{s35},~{s36},~{s37},~{s38},~{s39},~{s40},~{s41},~{s42}
    ,~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49},~{s50}

diff  --git a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll
index ca43ce6a438e2..6d91c33fd2876 100644
--- a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll
@@ -131,7 +131,7 @@ define hidden i64 @split_i64_arg(i64 returned %arg) local_unnamed_addr #0 !dbg !
   ret i64 %arg, !dbg !69
 }
 
-define hidden i8 addrspace(1)* @split_ptr_arg(i8 addrspace(1)* readnone returned %arg) local_unnamed_addr #0 !dbg !70 {
+define hidden ptr addrspace(1) @split_ptr_arg(ptr addrspace(1) readnone returned %arg) local_unnamed_addr #0 !dbg !70 {
 ; GCN-LABEL: split_ptr_arg:
 ; GCN:       .Lfunc_begin6:
 ; GCN-NEXT:    .loc 0 27 0 ; /tmp/dbg.cl:27:0
@@ -145,8 +145,8 @@ define hidden i8 addrspace(1)* @split_ptr_arg(i8 addrspace(1)* readnone returned
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-NEXT:  .Ltmp17:
 ; GCN:         .cfi_endproc
-  call void @llvm.dbg.value(metadata i8 addrspace(1)* %arg, metadata !76, metadata !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)), !dbg !77
-  ret i8 addrspace(1)* %arg, !dbg !78
+  call void @llvm.dbg.value(metadata ptr addrspace(1) %arg, metadata !76, metadata !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)), !dbg !77
+  ret ptr addrspace(1) %arg, !dbg !78
 }
 
 declare void @llvm.dbg.value(metadata, metadata, metadata) #1

diff  --git a/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
index 7645dad73e659..c86c78b42e1dc 100644
--- a/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
@@ -13,50 +13,50 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone
 ; SI: v_mov_b32_e32 [[V_VAL:v[0-9]+]], s
 ; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, [[V_VAL]]
 ; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
-define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) {
-  %v.val = load volatile i32, i32 addrspace(1)* %in
+define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %s.val) {
+  %v.val = load volatile i32, ptr addrspace(1) %in
   %vec.0 = insertelement <2 x i32> undef, i32 %s.val, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1
   %bc = bitcast <2 x i32> %vec.1 to i64
   %add = add i64 %bc, 399
-  store i64 %add, i64 addrspace(1)* %out, align 8
+  store i64 %add, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_0:
 ; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x18f
 ; SI: s_addc_u32 {{s[0-9]+}}, 0xf423f, 0
-define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) {
+define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_0(ptr addrspace(1) %out, i32 %val) {
   %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1
   %bc = bitcast <2 x i32> %vec.1 to i64
   %add = add i64 %bc, 399
-  store i64 %add, i64 addrspace(1)* %out, align 8
+  store i64 %add, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define amdgpu_kernel void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
-  %v.val = load volatile i32, i32 addrspace(1)* %in
+define amdgpu_kernel void @imp_def_vcc_split_i64_add_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val0, i64 %val1) {
+  %v.val = load volatile i32, ptr addrspace(1) %in
   %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1
   %bc = bitcast <2 x i32> %vec.1 to i64
   %add = add i64 %bc, %val1
-  store i64 %add, i64 addrspace(1)* %out, align 8
+  store i64 %add, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_1:
 ; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; SI: s_addc_u32 {{s[0-9]+}}, 0x1869f, {{s[0-9]+}}
-define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) {
+define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_1(ptr addrspace(1) %out, i32 %val0, i64 %val1) {
   %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1
   %bc = bitcast <2 x i32> %vec.1 to i64
   %add = add i64 %bc, %val1
-  store i64 %add, i64 addrspace(1)* %out, align 8
+  store i64 %add, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -64,14 +64,14 @@ define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out,
 ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2:
 ; SI: v_add_i32_e32 {{v[0-9]+}}, vcc, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: v_addc_u32_e32 {{v[0-9]+}}, vcc, {{v[0-9]+}}, {{v[0-9]+}}, vcc
-define amdgpu_kernel void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
+define amdgpu_kernel void @imp_def_vcc_split_i64_add_2(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val0, i64 %val1) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
-  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %load = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %load = load i32, ptr addrspace(1) %gep
   %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 %load, i32 1
   %bc = bitcast <2 x i32> %vec.1 to i64
   %add = add i64 %bc, %val1
-  store i64 %add, i64 addrspace(1)* %out, align 8
+  store i64 %add, ptr addrspace(1) %out, align 8
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/split-smrd.ll b/llvm/test/CodeGen/AMDGPU/split-smrd.ll
index d8dc6217c692d..6fd7554c77900 100644
--- a/llvm/test/CodeGen/AMDGPU/split-smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-smrd.ll
@@ -6,7 +6,7 @@
 
 ; GCN-LABEL: {{^}}split_smrd_add_worklist:
 ; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
-define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(4)* inreg %arg) #0 {
+define amdgpu_ps void @split_smrd_add_worklist(ptr addrspace(4) inreg %arg) #0 {
 bb:
   %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 96, i32 0)
   %tmp1 = bitcast float %tmp to i32
@@ -19,8 +19,8 @@ bb3:                                              ; preds = %bb
   %tmp4 = bitcast float %tmp to i32
   %tmp5 = add i32 %tmp4, 4
   %tmp6 = sext i32 %tmp5 to i64
-  %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(4)* %arg, i64 0, i64 %tmp6
-  %tmp8 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp7, align 32, !tbaa !0
+  %tmp7 = getelementptr [34 x <8 x i32>], ptr addrspace(4) %arg, i64 0, i64 %tmp6
+  %tmp8 = load <8 x i32>, ptr addrspace(4) %tmp7, align 32, !tbaa !0
   %tmp9 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float bitcast (i32 1061158912 to float), float bitcast (i32 1048576000 to float), <8 x i32> %tmp8, <4 x i32> undef, i1 0, i32 0, i32 0)
   %tmp10 = extractelement <4 x float> %tmp9, i32 0
   %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef)

diff  --git a/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
index 6cecc23a8810c..02d5943f7791a 100644
--- a/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
@@ -29,7 +29,7 @@
 ; GCN-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24
 
 ; GCN: s_endpgm
-define amdgpu_kernel void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset, i32 %tmp, i32 %tmp1, i32 %x.i.12.i) #0 {
+define amdgpu_kernel void @ds_reorder_vector_split(ptr addrspace(1) nocapture readonly %srcValues, ptr addrspace(1) nocapture readonly %offsets, ptr addrspace(1) nocapture %destBuffer, i32 %alignmentOffset, i32 %tmp, i32 %tmp1, i32 %x.i.12.i) #0 {
 entry:
   %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp3 = tail call i32 @llvm.amdgcn.workitem.id.y()
@@ -41,37 +41,36 @@ entry:
   %x.i.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
   %mul.26.i = mul i32 %x.i.12.i, %x.i.i
   %add.i = add i32 %tmp2, %mul.26.i
-  %arrayidx = getelementptr [256 x [8 x <4 x i64>]], [256 x [8 x <4 x i64>]] addrspace(3)* @sPrivateStorage, i32 0, i32 %tmp9, i32 %add.i
-  store <4 x i64> zeroinitializer, <4 x i64> addrspace(3)* %arrayidx
+  %arrayidx = getelementptr [256 x [8 x <4 x i64>]], ptr addrspace(3) @sPrivateStorage, i32 0, i32 %tmp9, i32 %add.i
+  store <4 x i64> zeroinitializer, ptr addrspace(3) %arrayidx
   %tmp12 = sext i32 %add.i to i64
-  %arrayidx1 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %srcValues, i64 %tmp12
-  %tmp13 = load <4 x i64>, <4 x i64> addrspace(1)* %arrayidx1
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %tmp12
-  %tmp14 = load i32, i32 addrspace(1)* %arrayidx2
-  %add.ptr = getelementptr [256 x [8 x <4 x i64>]], [256 x [8 x <4 x i64>]] addrspace(3)* @sPrivateStorage, i32 0, i32 %tmp9, i32 0, i32 %alignmentOffset
+  %arrayidx1 = getelementptr inbounds <4 x i64>, ptr addrspace(1) %srcValues, i64 %tmp12
+  %tmp13 = load <4 x i64>, ptr addrspace(1) %arrayidx1
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %offsets, i64 %tmp12
+  %tmp14 = load i32, ptr addrspace(1) %arrayidx2
+  %add.ptr = getelementptr [256 x [8 x <4 x i64>]], ptr addrspace(3) @sPrivateStorage, i32 0, i32 %tmp9, i32 0, i32 %alignmentOffset
   %mul.i = shl i32 %tmp14, 2
-  %arrayidx.i = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr, i32 %mul.i
-  %tmp15 = bitcast i64 addrspace(3)* %arrayidx.i to <4 x i64> addrspace(3)*
-  store <4 x i64> %tmp13, <4 x i64> addrspace(3)* %tmp15
-  %add.ptr6 = getelementptr [256 x [8 x <4 x i64>]], [256 x [8 x <4 x i64>]] addrspace(3)* @sPrivateStorage, i32 0, i32 %tmp9, i32 %tmp14, i32 %alignmentOffset
+  %arrayidx.i = getelementptr inbounds i64, ptr addrspace(3) %add.ptr, i32 %mul.i
+  store <4 x i64> %tmp13, ptr addrspace(3) %arrayidx.i
+  %add.ptr6 = getelementptr [256 x [8 x <4 x i64>]], ptr addrspace(3) @sPrivateStorage, i32 0, i32 %tmp9, i32 %tmp14, i32 %alignmentOffset
   %tmp16 = sext i32 %tmp14 to i64
   %tmp17 = sext i32 %alignmentOffset to i64
-  %add.ptr9 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %destBuffer, i64 %tmp16, i64 %tmp17
+  %add.ptr9 = getelementptr inbounds <4 x i64>, ptr addrspace(1) %destBuffer, i64 %tmp16, i64 %tmp17
   %tmp18 = bitcast <4 x i64> %tmp13 to i256
   %trunc = trunc i256 %tmp18 to i64
-  store i64 %trunc, i64 addrspace(1)* %add.ptr9
-  %arrayidx10.1 = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr6, i32 1
-  %tmp19 = load i64, i64 addrspace(3)* %arrayidx10.1
-  %arrayidx11.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr9, i64 1
-  store i64 %tmp19, i64 addrspace(1)* %arrayidx11.1
-  %arrayidx10.2 = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr6, i32 2
-  %tmp20 = load i64, i64 addrspace(3)* %arrayidx10.2
-  %arrayidx11.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr9, i64 2
-  store i64 %tmp20, i64 addrspace(1)* %arrayidx11.2
-  %arrayidx10.3 = getelementptr inbounds i64, i64 addrspace(3)* %add.ptr6, i32 3
-  %tmp21 = load i64, i64 addrspace(3)* %arrayidx10.3
-  %arrayidx11.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr9, i64 3
-  store i64 %tmp21, i64 addrspace(1)* %arrayidx11.3
+  store i64 %trunc, ptr addrspace(1) %add.ptr9
+  %arrayidx10.1 = getelementptr inbounds i64, ptr addrspace(3) %add.ptr6, i32 1
+  %tmp19 = load i64, ptr addrspace(3) %arrayidx10.1
+  %arrayidx11.1 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr9, i64 1
+  store i64 %tmp19, ptr addrspace(1) %arrayidx11.1
+  %arrayidx10.2 = getelementptr inbounds i64, ptr addrspace(3) %add.ptr6, i32 2
+  %tmp20 = load i64, ptr addrspace(3) %arrayidx10.2
+  %arrayidx11.2 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr9, i64 2
+  store i64 %tmp20, ptr addrspace(1) %arrayidx11.2
+  %arrayidx10.3 = getelementptr inbounds i64, ptr addrspace(3) %add.ptr6, i32 3
+  %tmp21 = load i64, ptr addrspace(3) %arrayidx10.3
+  %arrayidx11.3 = getelementptr inbounds i64, ptr addrspace(1) %add.ptr9, i64 3
+  store i64 %tmp21, ptr addrspace(1) %arrayidx11.3
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index 99a2091918057..54c4698e03dd8 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -6,26 +6,26 @@
 ; FUNC-LABEL: {{^}}srem_i16_7:
 ; GFX9: s_movk_i32 {{s[0-9]+}}, 0x4925
 ; GFX9: v_mul_lo_u32
-define amdgpu_kernel void @srem_i16_7(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
-  %num = load i16, i16 addrspace(1) * %in
+define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %num = load i16, ptr addrspace(1) %in
   %result = srem i16 %num, 7
-  store i16 %result, i16 addrspace(1)* %out
+  store i16 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in
-  %den = load i32, i32 addrspace(1) * %den_ptr
+define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in
+  %den = load i32, ptr addrspace(1) %den_ptr
   %result = srem i32 %num, %den
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %num = load i32, i32 addrspace(1) * %in
+define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %num = load i32, ptr addrspace(1) %in
   %result = srem i32 %num, 4
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -36,89 +36,89 @@ define amdgpu_kernel void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; SI: v_mul_lo_u32
 ; SI: v_sub_{{[iu]}}32
 ; SI: s_endpgm
-define amdgpu_kernel void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %num = load i32, i32 addrspace(1) * %in
+define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %num = load i32, ptr addrspace(1) %in
   %result = srem i32 %num, 7
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
-  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
-  %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
+define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
+  %num = load <2 x i32>, ptr addrspace(1) %in
+  %den = load <2 x i32>, ptr addrspace(1) %den_ptr
   %result = srem <2 x i32> %num, %den
-  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %num = load <2 x i32>, ptr addrspace(1) %in
   %result = srem <2 x i32> %num, <i32 4, i32 4>
-  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
-  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
-  %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
+define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
+  %num = load <4 x i32>, ptr addrspace(1) %in
+  %den = load <4 x i32>, ptr addrspace(1) %den_ptr
   %result = srem <4 x i32> %num, %den
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %num = load <4 x i32>, ptr addrspace(1) %in
   %result = srem <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
-  %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
-  %num = load i64, i64 addrspace(1) * %in
-  %den = load i64, i64 addrspace(1) * %den_ptr
+define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1
+  %num = load i64, ptr addrspace(1) %in
+  %den = load i64, ptr addrspace(1) %den_ptr
   %result = srem i64 %num, %den
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
-  %num = load i64, i64 addrspace(1) * %in
+define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %num = load i64, ptr addrspace(1) %in
   %result = srem i64 %num, 4
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
-  %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
-  %num = load <2 x i64>, <2 x i64> addrspace(1) * %in
-  %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr
+define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1
+  %num = load <2 x i64>, ptr addrspace(1) %in
+  %den = load <2 x i64>, ptr addrspace(1) %den_ptr
   %result = srem <2 x i64> %num, %den
-  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
-  %num = load <2 x i64>, <2 x i64> addrspace(1) * %in
+define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %num = load <2 x i64>, ptr addrspace(1) %in
   %result = srem <2 x i64> %num, <i64 4, i64 4>
-  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
-  %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
-  %num = load <4 x i64>, <4 x i64> addrspace(1) * %in
-  %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr
+define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
+  %num = load <4 x i64>, ptr addrspace(1) %in
+  %den = load <4 x i64>, ptr addrspace(1) %den_ptr
   %result = srem <4 x i64> %num, %den
-  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
-  %num = load <4 x i64>, <4 x i64> addrspace(1) * %in
+define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %num = load <4 x i64>, ptr addrspace(1) %in
   %result = srem <4 x i64> %num, <i64 4, i64 4, i64 4, i64 4>
-  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %result, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 8fdf6d1683ebb..27b551ad88376 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
 
-define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0xd
@@ -209,7 +209,7 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 %x, %y
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -445,7 +445,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem23_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -518,11 +518,11 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = ashr i64 %x, 41
   %2 = ashr i64 %y, 41
   %result = srem i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem24_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -595,7 +595,7 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = ashr i64 %x, 40
   %2 = ashr i64 %y, 40
   %result = srem i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -653,7 +653,7 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) {
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem25_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -726,11 +726,11 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = ashr i64 %x, 39
   %2 = ashr i64 %y, 39
   %result = srem i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem31_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -803,12 +803,12 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = ashr i64 %x, 33
   %2 = ashr i64 %y, 33
   %result = srem i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; 32 known sign bits
-define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem32_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s8, s[0:1], 0xe
@@ -869,12 +869,12 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = ashr i64 %x, 32
   %2 = ashr i64 %y, 32
   %result = srem i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; 33 known sign bits
-define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem33_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1114,11 +1114,11 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = ashr i64 %x, 31
   %2 = ashr i64 %y, 31
   %result = srem i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 %y) {
+define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %y) {
 ; GCN-LABEL: s_test_srem24_48:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1269,11 +1269,11 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
   %result = srem i48 %1, %2
-  store i48 %result, i48 addrspace(1)* %out
+  store i48 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_srem_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1464,7 +1464,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = srem i64 24, %x
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -1959,7 +1959,7 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_srem24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2025,11 +2025,11 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_endpgm
   %x.shr = ashr i64 %x, 40
   %result = srem i64 24, %x.shr
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_srem24_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2093,7 +2093,7 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_endpgm
   %x.shr = ashr i64 %x, 40
   %result = srem i64 %x.shr, 23423
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/store-barrier.ll b/llvm/test/CodeGen/AMDGPU/store-barrier.ll
index afa4e94222cd9..c45579a421721 100644
--- a/llvm/test/CodeGen/AMDGPU/store-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-barrier.ll
@@ -12,27 +12,27 @@
 ; CHECK: s_barrier
 ; CHECK: s_endpgm
 ; Function Attrs: nounwind
-define amdgpu_kernel void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 {
+define amdgpu_kernel void @test(ptr addrspace(3) nocapture %arg, ptr addrspace(1) nocapture readonly %arg1, ptr addrspace(1) nocapture readonly %arg2, ptr addrspace(1) nocapture %arg3, i32 %arg4, i64 %tmp9) #0 {
 bb:
-  %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9
-  %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2
-  %tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13
-  %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 1
+  %tmp10 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp9
+  %tmp13 = load i32, ptr addrspace(1) %tmp10, align 2
+  %tmp14 = getelementptr inbounds <2 x i8>, ptr addrspace(3) %arg, i32 %tmp13
+  %tmp15 = load <2 x i8>, ptr addrspace(3) %tmp14, align 1
   %tmp16 = add i32 %tmp13, 1
-  %tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16
-  store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 1
+  %tmp17 = getelementptr inbounds <2 x i8>, ptr addrspace(3) %arg, i32 %tmp16
+  store <2 x i8> %tmp15, ptr addrspace(3) %tmp17, align 1
   tail call void @llvm.amdgcn.s.barrier()
-  %tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4
+  %tmp25 = load i32, ptr addrspace(1) %tmp10, align 4
   %tmp26 = sext i32 %tmp25 to i64
   %tmp27 = sext i32 %arg4 to i64
-  %tmp28 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 %arg4
-  %tmp29 = load i8, i8 addrspace(3)* %tmp28, align 1
-  %tmp30 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 %tmp27
-  store i8 %tmp29, i8 addrspace(1)* %tmp30, align 1
-  %tmp32 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 0
-  %tmp33 = load i8, i8 addrspace(3)* %tmp32, align 1
-  %tmp35 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 0
-  store i8 %tmp33, i8 addrspace(1)* %tmp35, align 1
+  %tmp28 = getelementptr inbounds <2 x i8>, ptr addrspace(3) %arg, i32 %tmp25, i32 %arg4
+  %tmp29 = load i8, ptr addrspace(3) %tmp28, align 1
+  %tmp30 = getelementptr inbounds <2 x i8>, ptr addrspace(1) %arg3, i64 %tmp26, i64 %tmp27
+  store i8 %tmp29, ptr addrspace(1) %tmp30, align 1
+  %tmp32 = getelementptr inbounds <2 x i8>, ptr addrspace(3) %arg, i32 %tmp25, i32 0
+  %tmp33 = load i8, ptr addrspace(3) %tmp32, align 1
+  %tmp35 = getelementptr inbounds <2 x i8>, ptr addrspace(1) %arg3, i64 %tmp26, i64 0
+  store i8 %tmp33, ptr addrspace(1) %tmp35, align 1
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/store-global.ll b/llvm/test/CodeGen/AMDGPU/store-global.ll
index 6085e1a88549b..1d4fd51cc166f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-global.ll
@@ -13,9 +13,9 @@
 
 ; SIVI: buffer_store_byte
 ; GFX9: global_store_byte
-define amdgpu_kernel void @store_i1(i1 addrspace(1)* %out) {
+define amdgpu_kernel void @store_i1(ptr addrspace(1) %out) {
 entry:
-  store i1 true, i1 addrspace(1)* %out
+  store i1 true, ptr addrspace(1) %out
   ret void
 }
 
@@ -33,9 +33,9 @@ entry:
 
 ; SIVI: buffer_store_byte
 ; GFX9: global_store_byte
-define amdgpu_kernel void @store_i8(i8 addrspace(1)* %out, i8 %in) {
+define amdgpu_kernel void @store_i8(ptr addrspace(1) %out, i8 %in) {
 entry:
-  store i8 %in, i8 addrspace(1)* %out
+  store i8 %in, ptr addrspace(1) %out
   ret void
 }
 
@@ -54,9 +54,9 @@ entry:
 
 ; SIVI: buffer_store_short
 ; GFX9: global_store_short
-define amdgpu_kernel void @store_i16(i16 addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @store_i16(ptr addrspace(1) %out, i16 %in) {
 entry:
-  store i16 %in, i16 addrspace(1)* %out
+  store i16 %in, ptr addrspace(1) %out
   ret void
 }
 
@@ -70,9 +70,9 @@ entry:
 
 ; EG: MEM_RAT MSKOR
 ; EG: MEM_RAT MSKOR
-define amdgpu_kernel void @store_i24(i24 addrspace(1)* %out, i24 %in) {
+define amdgpu_kernel void @store_i24(ptr addrspace(1) %out, i24 %in) {
 entry:
-  store i24 %in, i24 addrspace(1)* %out
+  store i24 %in, ptr addrspace(1) %out
   ret void
 }
 
@@ -87,9 +87,9 @@ entry:
 
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
 ; CM-NOT: MEM_RAT
-define amdgpu_kernel void @store_i25(i25 addrspace(1)* %out, i25 %in) {
+define amdgpu_kernel void @store_i25(ptr addrspace(1) %out, i25 %in) {
 entry:
-  store i25 %in, i25 addrspace(1)* %out
+  store i25 %in, ptr addrspace(1) %out
   ret void
 }
 
@@ -103,10 +103,10 @@ entry:
 
 ; SIVI: buffer_store_short
 ; GFX9: global_store_short
-define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8(ptr addrspace(1) %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
-  store <2 x i8> %0, <2 x i8> addrspace(1)* %out
+  store <2 x i8> %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -120,10 +120,10 @@ entry:
 ; CM-NOT: MEM_RAT MSKOR
 
 ; SI: buffer_store_byte
-define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8_unaligned(ptr addrspace(1) %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
-  store <2 x i8> %0, <2 x i8> addrspace(1)* %out, align 1
+  store <2 x i8> %0, ptr addrspace(1) %out, align 1
   ret void
 }
 
@@ -135,10 +135,10 @@ entry:
 
 ; SIVI: buffer_store_dword
 ; GFX9: global_store_dword
-define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16(ptr addrspace(1) %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
-  store <2 x i16> %0, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -158,10 +158,10 @@ entry:
 
 ; GFX9: global_store_short
 ; GFX9: global_store_short
-define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16_unaligned(ptr addrspace(1) %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
-  store <2 x i16> %0, <2 x i16> addrspace(1)* %out, align 2
+  store <2 x i16> %0, ptr addrspace(1) %out, align 2
   ret void
 }
 
@@ -172,10 +172,10 @@ entry:
 
 ; SIVI: buffer_store_dword
 ; GFX9: global_store_dword
-define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8(ptr addrspace(1) %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
-  store <4 x i8> %0, <4 x i8> addrspace(1)* %out
+  store <4 x i8> %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -199,10 +199,10 @@ entry:
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 ; SI-NOT: buffer_store_dword
-define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_unaligned(ptr addrspace(1) %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
-  store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 1
+  store <4 x i8> %0, ptr addrspace(1) %out, align 1
   ret void
 }
 
@@ -220,10 +220,10 @@ entry:
 ; SI: buffer_store_short
 ; SI: buffer_store_short
 ; SI-NOT: buffer_store_dword
-define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_halfaligned(ptr addrspace(1) %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
-  store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 2
+  store <4 x i8> %0, ptr addrspace(1) %out, align 2
   ret void
 }
 
@@ -236,8 +236,8 @@ entry:
 ; SIVI: buffer_store_dword
 ; GFX9: global_store_dword
 
-define amdgpu_kernel void @store_f32(float addrspace(1)* %out, float %in) {
-  store float %in, float addrspace(1)* %out
+define amdgpu_kernel void @store_f32(ptr addrspace(1) %out, float %in) {
+  store float %in, ptr addrspace(1) %out
   ret void
 }
 
@@ -248,10 +248,10 @@ define amdgpu_kernel void @store_f32(float addrspace(1)* %out, float %in) {
 
 ; SIVI: buffer_store_dwordx2
 ; GFX9: global_store_dwordx2
-define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i16(ptr addrspace(1) %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i16>
-  store <4 x i16> %0, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -264,11 +264,11 @@ entry:
 ; SIVI: buffer_store_dwordx2
 ; GFX9: global_store_dwordx2
 
-define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @store_v2f32(ptr addrspace(1) %out, float %a, float %b) {
 entry:
   %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
   %1 = insertelement <2 x float> %0, float %b, i32 1
-  store <2 x float> %1, <2 x float> addrspace(1)* %out
+  store <2 x float> %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -282,8 +282,8 @@ entry:
 
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}},
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XY}}, {{T[0-9]+\.[XYZW]}},
-define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
-  store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16
+define amdgpu_kernel void @store_v3i32(ptr addrspace(1) %out, <3 x i32> %a) nounwind {
+  store <3 x i32> %a, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -296,9 +296,9 @@ define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %
 
 ; SIVI: buffer_store_dwordx4
 ; GFX9: global_store_dwordx4
-define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32(ptr addrspace(1) %out, <4 x i32> %in) {
 entry:
-  store <4 x i32> %in, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %in, ptr addrspace(1) %out
   ret void
 }
 
@@ -311,9 +311,9 @@ entry:
 
 ; SIVI: buffer_store_dwordx4
 ; GFX9: global_store_dwordx4
-define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32_unaligned(ptr addrspace(1) %out, <4 x i32> %in) {
 entry:
-  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
+  store <4 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -327,9 +327,9 @@ entry:
 
 ; SIVI: buffer_store_dwordx4
 ; GFX9: global_store_dwordx4
-define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
-  %1 = load <4 x float>, <4 x float> addrspace(1) * %in
-  store <4 x float> %1, <4 x float> addrspace(1)* %out
+define amdgpu_kernel void @store_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %1 = load <4 x float>, ptr addrspace(1) %in
+  store <4 x float> %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -340,10 +340,10 @@ define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x floa
 
 ; SIVI: buffer_store_byte
 ; GFX9: global_store_byte
-define amdgpu_kernel void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i8(ptr addrspace(1) %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
-  store i8 %0, i8 addrspace(1)* %out
+  store i8 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -351,10 +351,10 @@ entry:
 ; EG: MEM_RAT MSKOR
 ; SIVI: buffer_store_short
 ; GFX9: global_store_short
-define amdgpu_kernel void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i16(ptr addrspace(1) %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
-  store i16 %0, i16 addrspace(1)* %out
+  store i16 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -371,14 +371,14 @@ entry:
 
 ; SIVI: buffer_store_dwordx2
 ; GFX9: global_store_dwordx2
-define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* nocapture %mem) #0 {
+define amdgpu_kernel void @vecload2(ptr addrspace(1) nocapture %out, ptr addrspace(4) nocapture %mem) #0 {
 entry:
-  %0 = load i32, i32 addrspace(4)* %mem, align 4
-  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(4)* %mem, i64 1
-  %1 = load i32, i32 addrspace(4)* %arrayidx1.i, align 4
-  store i32 %0, i32 addrspace(1)* %out, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
+  %0 = load i32, ptr addrspace(4) %mem, align 4
+  %arrayidx1.i = getelementptr inbounds i32, ptr addrspace(4) %mem, i64 1
+  %1 = load i32, ptr addrspace(4) %arrayidx1.i, align 4
+  store i32 %0, ptr addrspace(1) %out, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
+  store i32 %1, ptr addrspace(1) %arrayidx1, align 4
   ret void
 }
 
@@ -391,15 +391,15 @@ entry:
 
 ; SIVI: buffer_store_dwordx4
 ; GFX9: global_store_dwordx4
-define amdgpu_kernel void @i128-const-store(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @i128-const-store(ptr addrspace(1) %out) {
 entry:
-  store i32 1, i32 addrspace(1)* %out, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 1, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 2, i32 addrspace(1)* %arrayidx4, align 4
-  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 2, i32 addrspace(1)* %arrayidx6, align 4
+  store i32 1, ptr addrspace(1) %out, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
+  store i32 1, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
+  store i32 2, ptr addrspace(1) %arrayidx4, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
+  store i32 2, ptr addrspace(1) %arrayidx6, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll
index e4699f3f926fe..dab50b16e2d5d 100644
--- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll
@@ -13,12 +13,12 @@
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_global_hi_v2i16(i16 addrspace(1)* %out, i32 %arg) #0 {
+define void @store_global_hi_v2i16(ptr addrspace(1) %out, i32 %arg) #0 {
 entry:
   ; FIXME: ABI for pre-gfx9
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  store i16 %hi, i16 addrspace(1)* %out
+  store i16 %hi, ptr addrspace(1) %out
   ret void
 }
 
@@ -32,12 +32,12 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_global_hi_v2f16(half addrspace(1)* %out, i32 %arg) #0 {
+define void @store_global_hi_v2f16(ptr addrspace(1) %out, i32 %arg) #0 {
 entry:
   ; FIXME: ABI for pre-gfx9
   %value = bitcast i32 %arg to <2 x half>
   %hi = extractelement <2 x half> %value, i32 1
-  store half %hi, half addrspace(1)* %out
+  store half %hi, ptr addrspace(1) %out
   ret void
 }
 
@@ -51,11 +51,11 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_global_hi_i32_shift(i16 addrspace(1)* %out, i32 %value) #0 {
+define void @store_global_hi_i32_shift(ptr addrspace(1) %out, i32 %value) #0 {
 entry:
   %hi32 = lshr i32 %value, 16
   %hi = trunc i32 %hi32 to i16
-  store i16 %hi, i16 addrspace(1)* %out
+  store i16 %hi, ptr addrspace(1) %out
   ret void
 }
 
@@ -69,12 +69,12 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_global_hi_v2i16_i8(i8 addrspace(1)* %out, i32 %arg) #0 {
+define void @store_global_hi_v2i16_i8(ptr addrspace(1) %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
   %trunc = trunc i16 %hi to i8
-  store i8 %trunc, i8 addrspace(1)* %out
+  store i8 %trunc, ptr addrspace(1) %out
   ret void
 }
 
@@ -88,11 +88,11 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_global_hi_i8_shift(i8 addrspace(1)* %out, i32 %value) #0 {
+define void @store_global_hi_i8_shift(ptr addrspace(1) %out, i32 %value) #0 {
 entry:
   %hi32 = lshr i32 %value, 16
   %hi = trunc i32 %hi32 to i8
-  store i8 %hi, i8 addrspace(1)* %out
+  store i8 %hi, ptr addrspace(1) %out
   ret void
 }
 
@@ -107,13 +107,13 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_global_hi_v2i16_max_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
+define void @store_global_hi_v2i16_max_offset(ptr addrspace(1) %out, i32 %arg) #0 {
 entry:
   ; FIXME: ABI for pre-gfx9
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  %gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 2047
-  store i16 %hi, i16 addrspace(1)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(1) %out, i64 2047
+  store i16 %hi, ptr addrspace(1) %gep
   ret void
 }
 
@@ -128,12 +128,12 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_global_hi_v2i16_min_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
+define void @store_global_hi_v2i16_min_offset(ptr addrspace(1) %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  %gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 -2048
-  store i16 %hi, i16 addrspace(1)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(1) %out, i64 -2048
+  store i16 %hi, ptr addrspace(1) %gep
   ret void
 }
 
@@ -148,13 +148,13 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_global_hi_v2i16_i8_max_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
+define void @store_global_hi_v2i16_i8_max_offset(ptr addrspace(1) %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
   %trunc = trunc i16 %hi to i8
-  %gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4095
-  store i8 %trunc, i8 addrspace(1)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(1) %out, i64 4095
+  store i8 %trunc, ptr addrspace(1) %gep
   ret void
 }
 
@@ -169,13 +169,13 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_global_hi_v2i16_i8_min_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
+define void @store_global_hi_v2i16_i8_min_offset(ptr addrspace(1) %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
   %trunc = trunc i16 %hi to i8
-  %gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 -4095
-  store i8 %trunc, i8 addrspace(1)* %gep
+  %gep = getelementptr inbounds i8, ptr addrspace(1) %out, i64 -4095
+  store i8 %trunc, ptr addrspace(1) %gep
   ret void
 }
 
@@ -189,11 +189,11 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_flat_hi_v2i16(i16* %out, i32 %arg) #0 {
+define void @store_flat_hi_v2i16(ptr %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  store i16 %hi, i16* %out
+  store i16 %hi, ptr %out
   ret void
 }
 
@@ -207,11 +207,11 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_flat_hi_v2f16(half* %out, i32 %arg) #0 {
+define void @store_flat_hi_v2f16(ptr %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x half>
   %hi = extractelement <2 x half> %value, i32 1
-  store half %hi, half* %out
+  store half %hi, ptr %out
   ret void
 }
 
@@ -225,11 +225,11 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_flat_hi_i32_shift(i16* %out, i32 %value) #0 {
+define void @store_flat_hi_i32_shift(ptr %out, i32 %value) #0 {
 entry:
   %hi32 = lshr i32 %value, 16
   %hi = trunc i32 %hi32 to i16
-  store i16 %hi, i16* %out
+  store i16 %hi, ptr %out
   ret void
 }
 
@@ -243,12 +243,12 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_flat_hi_v2i16_i8(i8* %out, i32 %arg) #0 {
+define void @store_flat_hi_v2i16_i8(ptr %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
   %trunc = trunc i16 %hi to i8
-  store i8 %trunc, i8* %out
+  store i8 %trunc, ptr %out
   ret void
 }
 
@@ -262,11 +262,11 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_flat_hi_i8_shift(i8* %out, i32 %value) #0 {
+define void @store_flat_hi_i8_shift(ptr %out, i32 %value) #0 {
 entry:
   %hi32 = lshr i32 %value, 16
   %hi = trunc i32 %hi32 to i8
-  store i8 %hi, i8* %out
+  store i8 %hi, ptr %out
   ret void
 }
 
@@ -281,12 +281,12 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_flat_hi_v2i16_max_offset(i16* %out, i32 %arg) #0 {
+define void @store_flat_hi_v2i16_max_offset(ptr %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  %gep = getelementptr inbounds i16, i16* %out, i64 2047
-  store i16 %hi, i16* %gep
+  %gep = getelementptr inbounds i16, ptr %out, i64 2047
+  store i16 %hi, ptr %gep
   ret void
 }
 
@@ -302,12 +302,12 @@ entry:
 ; GFX803: flat_store_short v[0:1], v2{{$}}
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_flat_hi_v2i16_neg_offset(i16* %out, i32 %arg) #0 {
+define void @store_flat_hi_v2i16_neg_offset(ptr %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  %gep = getelementptr inbounds i16, i16* %out, i64 -1023
-  store i16 %hi, i16* %gep
+  %gep = getelementptr inbounds i16, ptr %out, i64 -1023
+  store i16 %hi, ptr %gep
   ret void
 }
 
@@ -322,13 +322,13 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_flat_hi_v2i16_i8_max_offset(i8* %out, i32 %arg) #0 {
+define void @store_flat_hi_v2i16_i8_max_offset(ptr %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
   %trunc = trunc i16 %hi to i8
-  %gep = getelementptr inbounds i8, i8* %out, i64 4095
-  store i8 %trunc, i8* %gep
+  %gep = getelementptr inbounds i8, ptr %out, i64 4095
+  store i8 %trunc, ptr %gep
   ret void
 }
 
@@ -348,13 +348,13 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_flat_hi_v2i16_i8_neg_offset(i8* %out, i32 %arg) #0 {
+define void @store_flat_hi_v2i16_i8_neg_offset(ptr %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
   %trunc = trunc i16 %hi to i8
-  %gep = getelementptr inbounds i8, i8* %out, i64 -4095
-  store i8 %trunc, i8* %gep
+  %gep = getelementptr inbounds i8, ptr %out, i64 -4095
+  store i8 %trunc, ptr %gep
   ret void
 }
 
@@ -369,12 +369,12 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_private_hi_v2i16(i16 addrspace(5)* %out, i32 %arg) #0 {
+define void @store_private_hi_v2i16(ptr addrspace(5) %out, i32 %arg) #0 {
 entry:
   ; FIXME: ABI for pre-gfx9
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  store i16 %hi, i16 addrspace(5)* %out
+  store i16 %hi, ptr addrspace(5) %out
   ret void
 }
 
@@ -389,12 +389,12 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_private_hi_v2f16(half addrspace(5)* %out, i32 %arg) #0 {
+define void @store_private_hi_v2f16(ptr addrspace(5) %out, i32 %arg) #0 {
 entry:
   ; FIXME: ABI for pre-gfx9
   %value = bitcast i32 %arg to <2 x half>
   %hi = extractelement <2 x half> %value, i32 1
-  store half %hi, half addrspace(5)* %out
+  store half %hi, ptr addrspace(5) %out
   ret void
 }
 
@@ -409,11 +409,11 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_private_hi_i32_shift(i16 addrspace(5)* %out, i32 %value) #0 {
+define void @store_private_hi_i32_shift(ptr addrspace(5) %out, i32 %value) #0 {
 entry:
   %hi32 = lshr i32 %value, 16
   %hi = trunc i32 %hi32 to i16
-  store i16 %hi, i16 addrspace(5)* %out
+  store i16 %hi, ptr addrspace(5) %out
   ret void
 }
 
@@ -428,12 +428,12 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_private_hi_v2i16_i8(i8 addrspace(5)* %out, i32 %arg) #0 {
+define void @store_private_hi_v2i16_i8(ptr addrspace(5) %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
   %trunc = trunc i16 %hi to i8
-  store i8 %trunc, i8 addrspace(5)* %out
+  store i8 %trunc, ptr addrspace(5) %out
   ret void
 }
 
@@ -448,11 +448,11 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_private_hi_i8_shift(i8 addrspace(5)* %out, i32 %value) #0 {
+define void @store_private_hi_i8_shift(ptr addrspace(5) %out, i32 %value) #0 {
 entry:
   %hi32 = lshr i32 %value, 16
   %hi = trunc i32 %hi32 to i8
-  store i8 %hi, i8 addrspace(5)* %out
+  store i8 %hi, ptr addrspace(5) %out
   ret void
 }
 
@@ -466,12 +466,12 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_private_hi_v2i16_max_offset(i16 addrspace(5)* byval(i16) %out, i32 %arg) #0 {
+define void @store_private_hi_v2i16_max_offset(ptr addrspace(5) byval(i16) %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  %gep = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2047
-  store i16 %hi, i16 addrspace(5)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(5) %out, i64 2047
+  store i16 %hi, ptr addrspace(5) %gep
   ret void
 }
 
@@ -494,7 +494,7 @@ entry:
   ; FIXME: ABI for pre-gfx9
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  store volatile i16 %hi, i16 addrspace(5)* null
+  store volatile i16 %hi, ptr addrspace(5) null
   ret void
 }
 
@@ -516,7 +516,7 @@ entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
   %trunc = trunc i16 %hi to i8
-  store volatile i8 %trunc, i8 addrspace(5)* null
+  store volatile i8 %trunc, ptr addrspace(5) null
   ret void
 }
 
@@ -530,12 +530,12 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_local_hi_v2i16(i16 addrspace(3)* %out, i32 %arg) #0 {
+define void @store_local_hi_v2i16(ptr addrspace(3) %out, i32 %arg) #0 {
 entry:
   ; FIXME: ABI for pre-gfx9
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  store i16 %hi, i16 addrspace(3)* %out
+  store i16 %hi, ptr addrspace(3) %out
   ret void
 }
 
@@ -549,12 +549,12 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_local_hi_v2f16(half addrspace(3)* %out, i32 %arg) #0 {
+define void @store_local_hi_v2f16(ptr addrspace(3) %out, i32 %arg) #0 {
 entry:
   ; FIXME: ABI for pre-gfx9
   %value = bitcast i32 %arg to <2 x half>
   %hi = extractelement <2 x half> %value, i32 1
-  store half %hi, half addrspace(3)* %out
+  store half %hi, ptr addrspace(3) %out
   ret void
 }
 
@@ -568,11 +568,11 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_local_hi_i32_shift(i16 addrspace(3)* %out, i32 %value) #0 {
+define void @store_local_hi_i32_shift(ptr addrspace(3) %out, i32 %value) #0 {
 entry:
   %hi32 = lshr i32 %value, 16
   %hi = trunc i32 %hi32 to i16
-  store i16 %hi, i16 addrspace(3)* %out
+  store i16 %hi, ptr addrspace(3) %out
   ret void
 }
 
@@ -586,12 +586,12 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_local_hi_v2i16_i8(i8 addrspace(3)* %out, i32 %arg) #0 {
+define void @store_local_hi_v2i16_i8(ptr addrspace(3) %out, i32 %arg) #0 {
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
   %trunc = trunc i16 %hi to i8
-  store i8 %trunc, i8 addrspace(3)* %out
+  store i8 %trunc, ptr addrspace(3) %out
   ret void
 }
 
@@ -604,13 +604,13 @@ entry:
 
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-define void @store_local_hi_v2i16_max_offset(i16 addrspace(3)* %out, i32 %arg) #0 {
+define void @store_local_hi_v2i16_max_offset(ptr addrspace(3) %out, i32 %arg) #0 {
 entry:
   ; FIXME: ABI for pre-gfx9
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  %gep = getelementptr inbounds i16, i16 addrspace(3)* %out, i64 32767
-  store i16 %hi, i16 addrspace(3)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(3) %out, i64 32767
+  store i16 %hi, ptr addrspace(3) %gep
   ret void
 }
 
@@ -624,15 +624,14 @@ entry:
 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4058
 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-define void @store_private_hi_v2i16_to_offset(i32 %arg, [10 x i32] addrspace(5)* %obj0) #0 {
+define void @store_private_hi_v2i16_to_offset(i32 %arg, ptr addrspace(5) %obj0) #0 {
 entry:
   %obj1 = alloca [4096 x i16], align 2, addrspace(5)
-  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
-  store volatile i32 123, i32 addrspace(5)* %bc
+  store volatile i32 123, ptr addrspace(5) %obj0
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
-  store i16 %hi, i16 addrspace(5)* %gep
+  %gep = getelementptr inbounds [4096 x i16], ptr addrspace(5) %obj1, i32 0, i32 2027
+  store i16 %hi, ptr addrspace(5) %gep
   ret void
 }
 
@@ -645,16 +644,15 @@ entry:
 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4059
 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-define void @store_private_hi_v2i16_i8_to_offset(i32 %arg, [10 x i32] addrspace(5)* %obj0) #0 {
+define void @store_private_hi_v2i16_i8_to_offset(i32 %arg, ptr addrspace(5) %obj0) #0 {
 entry:
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
-  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
-  store volatile i32 123, i32 addrspace(5)* %bc
+  store volatile i32 123, ptr addrspace(5) %obj0
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
+  %gep = getelementptr inbounds [4096 x i8], ptr addrspace(5) %obj1, i32 0, i32 4055
   %trunc = trunc i16 %hi to i8
-  store i8 %trunc, i8 addrspace(5)* %gep
+  store i8 %trunc, ptr addrspace(5) %gep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
index b47f8f4faa1ba..0663b968366ab 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
 
-define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
+define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) {
 ; GFX9-LABEL: store_lds_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -72,11 +72,11 @@ define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i3
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
 ; GFX11-NEXT:    ds_store_b128 v4, v[0:3]
 ; GFX11-NEXT:    s_endpgm
-  store <4 x i32> %x, <4 x i32> addrspace(3)* %out
+  store <4 x i32> %x, ptr addrspace(3) %out
   ret void
 }
 
-define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
+define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) {
 ; GFX9-LABEL: store_lds_v4i32_align1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -311,11 +311,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX11-NEXT:    ds_store_b8_d16_hi v0, v4 offset:6
 ; GFX11-NEXT:    ds_store_b8 v0, v2 offset:7
 ; GFX11-NEXT:    s_endpgm
-  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
+  store <4 x i32> %x, ptr addrspace(3) %out, align 1
   ret void
 }
 
-define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
+define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) {
 ; GFX9-LABEL: store_lds_v4i32_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -433,11 +433,11 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX11-NEXT:    ds_store_b16 v0, v1 offset:8
 ; GFX11-NEXT:    ds_store_b16_d16_hi v0, v4 offset:6
 ; GFX11-NEXT:    s_endpgm
-  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
+  store <4 x i32> %x, ptr addrspace(3) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
+define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) {
 ; GFX9-LABEL: store_lds_v4i32_align4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -509,11 +509,11 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
 ; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
 ; GFX11-NEXT:    ds_store_2addr_b32 v0, v3, v4 offset0:2 offset1:3
 ; GFX11-NEXT:    s_endpgm
-  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
+  store <4 x i32> %x, ptr addrspace(3) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
+define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) {
 ; GFX9-LABEL: store_lds_v4i32_align8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -581,11 +581,11 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out,
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX11-NEXT:    ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1
 ; GFX11-NEXT:    s_endpgm
-  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8
+  store <4 x i32> %x, ptr addrspace(3) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
+define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) {
 ; GFX9-LABEL: store_lds_v4i32_align16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -652,6 +652,6 @@ define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out,
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
 ; GFX11-NEXT:    ds_store_b128 v4, v[0:3]
 ; GFX11-NEXT:    s_endpgm
-  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16
+  store <4 x i32> %x, ptr addrspace(3) %out, align 16
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll
index 2a74fd088a22b..1a7d6d91ebe01 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
 
-define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
+define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) {
 ; GFX9-LABEL: store_lds_v3i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
@@ -68,11 +68,11 @@ define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i3
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0
 ; GFX11-NEXT:    ds_store_b96 v3, v[0:2]
 ; GFX11-NEXT:    s_endpgm
-  store <3 x i32> %x, <3 x i32> addrspace(3)* %out
+  store <3 x i32> %x, ptr addrspace(3) %out
   ret void
 }
 
-define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
+define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) {
 ; GFX9-LABEL: store_lds_v3i32_align1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -259,11 +259,11 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX11-NEXT:    ds_store_b8_d16_hi v0, v3 offset:6
 ; GFX11-NEXT:    ds_store_b8 v0, v9 offset:7
 ; GFX11-NEXT:    s_endpgm
-  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
+  store <3 x i32> %x, ptr addrspace(3) %out, align 1
   ret void
 }
 
-define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
+define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) {
 ; GFX9-LABEL: store_lds_v3i32_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -362,11 +362,11 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX11-NEXT:    ds_store_b16 v0, v1 offset:8
 ; GFX11-NEXT:    ds_store_b16_d16_hi v0, v3 offset:6
 ; GFX11-NEXT:    s_endpgm
-  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
+  store <3 x i32> %x, ptr addrspace(3) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
+define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) {
 ; GFX9-LABEL: store_lds_v3i32_align4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -433,11 +433,11 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out,
 ; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
 ; GFX11-NEXT:    ds_store_b32 v0, v3 offset:8
 ; GFX11-NEXT:    s_endpgm
-  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4
+  store <3 x i32> %x, ptr addrspace(3) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
+define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) {
 ; GFX9-LABEL: store_lds_v3i32_align8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
@@ -504,11 +504,11 @@ define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out,
 ; GFX11-NEXT:    ds_store_b32 v2, v3 offset:8
 ; GFX11-NEXT:    ds_store_b64 v2, v[0:1]
 ; GFX11-NEXT:    s_endpgm
-  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8
+  store <3 x i32> %x, ptr addrspace(3) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
+define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) {
 ; GFX9-LABEL: store_lds_v3i32_align16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
@@ -571,6 +571,6 @@ define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out,
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0
 ; GFX11-NEXT:    ds_store_b96 v3, v[0:2]
 ; GFX11-NEXT:    s_endpgm
-  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16
+  store <3 x i32> %x, ptr addrspace(3) %out, align 16
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/store-local.ll b/llvm/test/CodeGen/AMDGPU/store-local.ll
index f302ea099b754..f3c0eed491d5b 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.ll
@@ -13,9 +13,9 @@
 ; CM: LDS_BYTE_WRITE
 
 ; GCN: ds_write_b8
-define amdgpu_kernel void @store_local_i1(i1 addrspace(3)* %out) {
+define amdgpu_kernel void @store_local_i1(ptr addrspace(3) %out) {
 entry:
-  store i1 true, i1 addrspace(3)* %out
+  store i1 true, ptr addrspace(3) %out
   ret void
 }
 
@@ -28,8 +28,8 @@ entry:
 ; CM: LDS_BYTE_WRITE
 
 ; GCN: ds_write_b8
-define amdgpu_kernel void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
-  store i8 %in, i8 addrspace(3)* %out
+define amdgpu_kernel void @store_local_i8(ptr addrspace(3) %out, i8 %in) {
+  store i8 %in, ptr addrspace(3) %out
   ret void
 }
 
@@ -42,8 +42,8 @@ define amdgpu_kernel void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
 ; CM: LDS_SHORT_WRITE
 
 ; GCN: ds_write_b16
-define amdgpu_kernel void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
-  store i16 %in, i16 addrspace(3)* %out
+define amdgpu_kernel void @store_local_i16(ptr addrspace(3) %out, i16 %in) {
+  store i16 %in, ptr addrspace(3) %out
   ret void
 }
 
@@ -56,9 +56,9 @@ define amdgpu_kernel void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
 ; CM: LDS_WRITE
 
 ; GCN: ds_write_b32
-define amdgpu_kernel void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
+define amdgpu_kernel void @store_local_v2i16(ptr addrspace(3) %out, <2 x i16> %in) {
 entry:
-  store <2 x i16> %in, <2 x i16> addrspace(3)* %out
+  store <2 x i16> %in, ptr addrspace(3) %out
   ret void
 }
 
@@ -71,9 +71,9 @@ entry:
 ; CM: LDS_WRITE
 
 ; GCN: ds_write_b32
-define amdgpu_kernel void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @store_local_v4i8(ptr addrspace(3) %out, <4 x i8> %in) {
 entry:
-  store <4 x i8> %in, <4 x i8> addrspace(3)* %out
+  store <4 x i8> %in, ptr addrspace(3) %out
   ret void
 }
 
@@ -97,9 +97,9 @@ entry:
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
-define amdgpu_kernel void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @store_local_v4i8_unaligned(ptr addrspace(3) %out, <4 x i8> %in) {
 entry:
-  store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 1
+  store <4 x i8> %in, ptr addrspace(3) %out, align 1
   ret void
 }
 
@@ -117,9 +117,9 @@ entry:
 
 ; GCN: ds_write_b16
 ; GCN: ds_write_b16
-define amdgpu_kernel void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @store_local_v4i8_halfaligned(ptr addrspace(3) %out, <4 x i8> %in) {
 entry:
-  store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 2
+  store <4 x i8> %in, ptr addrspace(3) %out, align 2
   ret void
 }
 
@@ -136,9 +136,9 @@ entry:
 ; CM-NOT: LDS_WRITE
 
 ; GCN: ds_write_b64
-define amdgpu_kernel void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_local_v2i32(ptr addrspace(3) %out, <2 x i32> %in) {
 entry:
-  store <2 x i32> %in, <2 x i32> addrspace(3)* %out
+  store <2 x i32> %in, ptr addrspace(3) %out
   ret void
 }
 
@@ -159,9 +159,9 @@ entry:
 ; SI: ds_write2_b32
 ; VI: ds_write_b128
 ; GFX9: ds_write_b128
-define amdgpu_kernel void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_local_v4i32(ptr addrspace(3) %out, <4 x i32> %in) {
 entry:
-  store <4 x i32> %in, <4 x i32> addrspace(3)* %out
+  store <4 x i32> %in, ptr addrspace(3) %out
   ret void
 }
 
@@ -181,9 +181,9 @@ entry:
 
 ; GCN: ds_write2_b32
 ; GCN: ds_write2_b32
-define amdgpu_kernel void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_local_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %in) {
 entry:
-  store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4
+  store <4 x i32> %in, ptr addrspace(3) %out, align 4
   ret void
 }
 
@@ -193,10 +193,10 @@ entry:
 
 ; EG: LDS_BYTE_WRITE
 ; GCN: ds_write_b8
-define amdgpu_kernel void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
+define amdgpu_kernel void @store_local_i64_i8(ptr addrspace(3) %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
-  store i8 %0, i8 addrspace(3)* %out
+  store i8 %0, ptr addrspace(3) %out
   ret void
 }
 
@@ -206,9 +206,9 @@ entry:
 
 ; EG: LDS_SHORT_WRITE
 ; GCN: ds_write_b16
-define amdgpu_kernel void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
+define amdgpu_kernel void @store_local_i64_i16(ptr addrspace(3) %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
-  store i16 %0, i16 addrspace(3)* %out
+  store i16 %0, ptr addrspace(3) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/store-private.ll b/llvm/test/CodeGen/AMDGPU/store-private.ll
index 840dc509d28c6..0c79fda1a6e37 100644
--- a/llvm/test/CodeGen/AMDGPU/store-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-private.ll
@@ -15,9 +15,9 @@
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 
 ; SI: buffer_store_byte
-define amdgpu_kernel void @store_i1(i1 addrspace(5)* %out) {
+define amdgpu_kernel void @store_i1(ptr addrspace(5) %out) {
 entry:
-  store i1 true, i1 addrspace(5)* %out
+  store i1 true, ptr addrspace(5) %out
   ret void
 }
 
@@ -46,9 +46,9 @@ entry:
 
 ; SI: buffer_store_byte
 
-define amdgpu_kernel void @store_i8(i8 addrspace(5)* %out, i8 %in) {
+define amdgpu_kernel void @store_i8(ptr addrspace(5) %out, i8 %in) {
 entry:
-  store i8 %in, i8 addrspace(5)* %out
+  store i8 %in, ptr addrspace(5) %out
   ret void
 }
 
@@ -74,9 +74,9 @@ entry:
 ; EG: MOV * T(0 + AR.x).X+, [[RES]]
 
 ; SI: buffer_store_short
-define amdgpu_kernel void @store_i16(i16 addrspace(5)* %out, i16 %in) {
+define amdgpu_kernel void @store_i16(ptr addrspace(5) %out, i16 %in) {
 entry:
-  store i16 %in, i16 addrspace(5)* %out
+  store i16 %in, ptr addrspace(5) %out
   ret void
 }
 
@@ -104,9 +104,9 @@ entry:
 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
 ; CM: MOVA_INT
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
-define amdgpu_kernel void @store_i24(i24 addrspace(5)* %out, i24 %in) {
+define amdgpu_kernel void @store_i24(ptr addrspace(5) %out, i24 %in) {
 entry:
-  store i24 %in, i24 addrspace(5)* %out
+  store i24 %in, ptr addrspace(5) %out
   ret void
 }
 
@@ -122,9 +122,9 @@ entry:
 ; CM: MOVA_INT
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 ; CM-NOT: MOVA_INT
-define amdgpu_kernel void @store_i25(i25 addrspace(5)* %out, i25 %in) {
+define amdgpu_kernel void @store_i25(ptr addrspace(5) %out, i25 %in) {
 entry:
-  store i25 %in, i25 addrspace(5)* %out
+  store i25 %in, ptr addrspace(5) %out
   ret void
 }
 
@@ -143,10 +143,10 @@ entry:
 ; CM-NOT: MOVA_INT
 
 ; SI: buffer_store_short
-define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(5)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8(ptr addrspace(5) %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
-  store <2 x i8> %0, <2 x i8> addrspace(5)* %out
+  store <2 x i8> %0, ptr addrspace(5) %out
   ret void
 }
 
@@ -174,10 +174,10 @@ entry:
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 
 ; SI: buffer_store_byte
-define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(5)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8_unaligned(ptr addrspace(5) %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
-  store <2 x i8> %0, <2 x i8> addrspace(5)* %out, align 1
+  store <2 x i8> %0, ptr addrspace(5) %out, align 1
   ret void
 }
 
@@ -193,10 +193,10 @@ entry:
 ; CM-NOT: MOVA_INT
 
 ; SI: buffer_store_dword
-define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(5)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16(ptr addrspace(5) %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
-  store <2 x i16> %0, <2 x i16> addrspace(5)* %out
+  store <2 x i16> %0, ptr addrspace(5) %out
   ret void
 }
 
@@ -225,10 +225,10 @@ entry:
 
 ; SI: buffer_store_short
 ; SI: buffer_store_short
-define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(5)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16_unaligned(ptr addrspace(5) %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
-  store <2 x i16> %0, <2 x i16> addrspace(5)* %out, align 2
+  store <2 x i16> %0, ptr addrspace(5) %out, align 2
   ret void
 }
 
@@ -242,10 +242,10 @@ entry:
 ; CM-NOT: MOVA_INT
 
 ; SI: buffer_store_dword
-define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(5)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8(ptr addrspace(5) %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
-  store <4 x i8> %0, <4 x i8> addrspace(5)* %out
+  store <4 x i8> %0, ptr addrspace(5) %out
   ret void
 }
 
@@ -301,10 +301,10 @@ entry:
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 ; SI-NOT: buffer_store_dword
-define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(5)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_unaligned(ptr addrspace(5) %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
-  store <4 x i8> %0, <4 x i8> addrspace(5)* %out, align 1
+  store <4 x i8> %0, ptr addrspace(5) %out, align 1
   ret void
 }
 
@@ -412,10 +412,10 @@ entry:
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 ; SI-NOT: buffer_store_dword
-define amdgpu_kernel void @store_v8i8_unaligned(<8 x i8> addrspace(5)* %out, <8 x i32> %in) {
+define amdgpu_kernel void @store_v8i8_unaligned(ptr addrspace(5) %out, <8 x i32> %in) {
 entry:
   %0 = trunc <8 x i32> %in to <8 x i8>
-  store <8 x i8> %0, <8 x i8> addrspace(5)* %out, align 1
+  store <8 x i8> %0, ptr addrspace(5) %out, align 1
   ret void
 }
 
@@ -445,10 +445,10 @@ entry:
 ; SI: buffer_store_short
 ; SI: buffer_store_short
 ; SI-NOT: buffer_store_dword
-define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(5)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_halfaligned(ptr addrspace(5) %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
-  store <4 x i8> %0, <4 x i8> addrspace(5)* %out, align 2
+  store <4 x i8> %0, ptr addrspace(5) %out, align 2
   ret void
 }
 
@@ -462,8 +462,8 @@ entry:
 
 ; SI: buffer_store_dword
 
-define amdgpu_kernel void @store_f32(float addrspace(5)* %out, float %in) {
-  store float %in, float addrspace(5)* %out
+define amdgpu_kernel void @store_f32(ptr addrspace(5) %out, float %in) {
+  store float %in, ptr addrspace(5) %out
   ret void
 }
 
@@ -482,10 +482,10 @@ define amdgpu_kernel void @store_f32(float addrspace(5)* %out, float %in) {
 ; XSI: buffer_store_dwordx2
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(5)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i16(ptr addrspace(5) %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i16>
-  store <4 x i16> %0, <4 x i16> addrspace(5)* %out
+  store <4 x i16> %0, ptr addrspace(5) %out
   ret void
 }
 
@@ -506,11 +506,11 @@ entry:
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 
-define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(5)* %out, float %a, float %b) {
+define amdgpu_kernel void @store_v2f32(ptr addrspace(5) %out, float %a, float %b) {
 entry:
   %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
   %1 = insertelement <2 x float> %0, float %b, i32 1
-  store <2 x float> %1, <2 x float> addrspace(5)* %out
+  store <2 x float> %1, ptr addrspace(5) %out
   ret void
 }
 
@@ -535,8 +535,8 @@ entry:
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 
-define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(5)* %out, <3 x i32> %a) nounwind {
-  store <3 x i32> %a, <3 x i32> addrspace(5)* %out, align 16
+define amdgpu_kernel void @store_v3i32(ptr addrspace(5) %out, <3 x i32> %a) nounwind {
+  store <3 x i32> %a, ptr addrspace(5) %out, align 16
   ret void
 }
 
@@ -565,9 +565,9 @@ define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(5)* %out, <3 x i32> %
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(5)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32(ptr addrspace(5) %out, <4 x i32> %in) {
 entry:
-  store <4 x i32> %in, <4 x i32> addrspace(5)* %out
+  store <4 x i32> %in, ptr addrspace(5) %out
   ret void
 }
 
@@ -596,9 +596,9 @@ entry:
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(5)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32_unaligned(ptr addrspace(5) %out, <4 x i32> %in) {
 entry:
-  store <4 x i32> %in, <4 x i32> addrspace(5)* %out, align 4
+  store <4 x i32> %in, ptr addrspace(5) %out, align 4
   ret void
 }
 
@@ -628,9 +628,9 @@ entry:
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(5)* %out, <4 x float> addrspace(5)* %in) {
-  %1 = load <4 x float>, <4 x float> addrspace(5)* %in
-  store <4 x float> %1, <4 x float> addrspace(5)* %out
+define amdgpu_kernel void @store_v4f32(ptr addrspace(5) %out, ptr addrspace(5) %in) {
+  %1 = load <4 x float>, ptr addrspace(5) %in
+  store <4 x float> %1, ptr addrspace(5) %out
   ret void
 }
 
@@ -646,10 +646,10 @@ define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(5)* %out, <4 x floa
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 
 ; SI: buffer_store_byte
-define amdgpu_kernel void @store_i64_i8(i8 addrspace(5)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i8(ptr addrspace(5) %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
-  store i8 %0, i8 addrspace(5)* %out
+  store i8 %0, ptr addrspace(5) %out
   ret void
 }
 
@@ -665,10 +665,10 @@ entry:
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 
 ; SI: buffer_store_short
-define amdgpu_kernel void @store_i64_i16(i16 addrspace(5)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i16(ptr addrspace(5) %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
-  store i16 %0, i16 addrspace(5)* %out
+  store i16 %0, ptr addrspace(5) %out
   ret void
 }
 
@@ -691,14 +691,14 @@ entry:
 ; XSI: buffer_store_dwordx2
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define amdgpu_kernel void @vecload2(i32 addrspace(5)* nocapture %out, i32 addrspace(4)* nocapture %mem) #0 {
+define amdgpu_kernel void @vecload2(ptr addrspace(5) nocapture %out, ptr addrspace(4) nocapture %mem) #0 {
 entry:
-  %0 = load i32, i32 addrspace(4)* %mem, align 4
-  %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(4)* %mem, i64 1
-  %1 = load i32, i32 addrspace(4)* %arrayidx1.i, align 4
-  store i32 %0, i32 addrspace(5)* %out, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 1
-  store i32 %1, i32 addrspace(5)* %arrayidx1, align 4
+  %0 = load i32, ptr addrspace(4) %mem, align 4
+  %arrayidx1.i = getelementptr inbounds i32, ptr addrspace(4) %mem, i64 1
+  %1 = load i32, ptr addrspace(4) %arrayidx1.i, align 4
+  store i32 %0, ptr addrspace(5) %out, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr addrspace(5) %out, i64 1
+  store i32 %1, ptr addrspace(5) %arrayidx1, align 4
   ret void
 }
 
@@ -729,15 +729,15 @@ entry:
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define amdgpu_kernel void @i128-const-store(i32 addrspace(5)* %out) {
+define amdgpu_kernel void @i128-const-store(ptr addrspace(5) %out) {
 entry:
-  store i32 1, i32 addrspace(5)* %out, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 1
-  store i32 1, i32 addrspace(5)* %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 2
-  store i32 2, i32 addrspace(5)* %arrayidx4, align 4
-  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 3
-  store i32 2, i32 addrspace(5)* %arrayidx6, align 4
+  store i32 1, ptr addrspace(5) %out, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(5) %out, i64 1
+  store i32 1, ptr addrspace(5) %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr addrspace(5) %out, i64 2
+  store i32 2, ptr addrspace(5) %arrayidx4, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr addrspace(5) %out, i64 3
+  store i32 2, ptr addrspace(5) %arrayidx6, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll b/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll
index 819f90acb02c7..92fb155430ba4 100644
--- a/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll
@@ -3,8 +3,8 @@
 
 ; SDAG: LLVM ERROR: Cannot select: {{[a-z0-9]+}}: ch = store<(store (s32) into %ir.ptr.load, addrspace 4)>
 ; GISEL: LLVM ERROR: cannot select: G_STORE %{{[0-9]+}}:vgpr(s32), %{{[0-9]+}}:vgpr(p4) :: (store (s32) into %ir.ptr.load, addrspace 4) (in function: store_to_constant_i32)
-define amdgpu_kernel void @store_to_constant_i32(i32 addrspace(4)* %ptr) {
+define amdgpu_kernel void @store_to_constant_i32(ptr addrspace(4) %ptr) {
 bb:
-  store i32 1, i32 addrspace(4)* %ptr, align 4
+  store i32 1, ptr addrspace(4) %ptr, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
index b2e69deb73cfc..1b0003c721b5f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -5,8 +5,8 @@
 ; GCN-LABEL: {{^}}global_store_v3i64:
 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define amdgpu_kernel void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
-  store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32
+define amdgpu_kernel void @global_store_v3i64(ptr addrspace(1) %out, <3 x i64> %x) {
+  store <3 x i64> %x, ptr addrspace(1) %out, align 32
   ret void
 }
 
@@ -40,8 +40,8 @@ define amdgpu_kernel void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define amdgpu_kernel void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
-  store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
+define amdgpu_kernel void @global_store_v3i64_unaligned(ptr addrspace(1) %out, <3 x i64> %x) {
+  store <3 x i64> %x, ptr addrspace(1) %out, align 1
   ret void
 }
 
@@ -54,8 +54,8 @@ define amdgpu_kernel void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)*
 
 ; VI: ds_write_b64
 ; VI: ds_write_b128
-define amdgpu_kernel void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
-  store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
+define amdgpu_kernel void @local_store_v3i64(ptr addrspace(3) %out, <3 x i64> %x) {
+  store <3 x i64> %x, ptr addrspace(3) %out, align 32
   ret void
 }
 
@@ -89,8 +89,8 @@ define amdgpu_kernel void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
-define amdgpu_kernel void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
-  store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1
+define amdgpu_kernel void @local_store_v3i64_unaligned(ptr addrspace(3) %out, <3 x i64> %x) {
+  store <3 x i64> %x, ptr addrspace(3) %out, align 1
   ret void
 }
 
@@ -98,18 +98,18 @@ define amdgpu_kernel void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %
 ; SI-DAG: buffer_store_dwordx2
 ; SI-DAG: buffer_store_dword v
 ; VI-DAG: buffer_store_dwordx3
-define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(ptr addrspace(1) %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i32>
-  store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i16:
 ; GCN-DAG: buffer_store_short
 ; GCN-DAG: buffer_store_dword v
-define amdgpu_kernel void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i16(ptr addrspace(1) %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i16>
-  store <3 x i16> %trunc, <3 x i16> addrspace(1)* %out
+  store <3 x i16> %trunc, ptr addrspace(1) %out
   ret void
 }
 
@@ -117,9 +117,9 @@ define amdgpu_kernel void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(
 ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i8:
 ; GCN-DAG: buffer_store_short
 ; GCN-DAG: buffer_store_byte v
-define amdgpu_kernel void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i8(ptr addrspace(1) %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i8>
-  store <3 x i8> %trunc, <3 x i8> addrspace(1)* %out
+  store <3 x i8> %trunc, ptr addrspace(1) %out
   ret void
 }
 
@@ -127,8 +127,8 @@ define amdgpu_kernel void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)
 ; GCN-DAG: buffer_store_byte v
 ; GCN-DAG: buffer_store_byte v
 ; GCN-DAG: buffer_store_byte v
-define amdgpu_kernel void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i1(ptr addrspace(1) %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i1>
-  store <3 x i1> %trunc, <3 x i1> addrspace(1)* %out
+  store <3 x i1> %trunc, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll b/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll
index a2146b8195222..93a62c9f5fb4a 100644
--- a/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll
@@ -5,8 +5,8 @@
 ; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting
 ; scratch loads and stores.
 ; CHECK-LABEL: {{^}}store_vector_ptrs:
-define amdgpu_kernel void @store_vector_ptrs(<4 x i32 addrspace(5)*> addrspace(5)* %out, <4 x [1024 x i32] addrspace(5)*> %array) nounwind {
-  %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(5)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
-  store <4 x i32 addrspace(5)*> %p, <4 x i32 addrspace(5)*> addrspace(5)* %out
+define amdgpu_kernel void @store_vector_ptrs(ptr addrspace(5) %out, <4 x ptr addrspace(5)> %array) nounwind {
+  %p = getelementptr [1024 x i32], <4 x ptr addrspace(5)> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
+  store <4 x ptr addrspace(5)> %p, ptr addrspace(5) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 8417b2114011c..c330c68ecf6b3 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
 
-define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
+define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 {
 ; CIVI-LABEL: local_store_i56:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45,11 +45,11 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
 ; GFX11-NEXT:    ds_store_b32 v0, v1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i56 %arg, i56 addrspace(3)* %ptr, align 8
+  store i56 %arg, ptr addrspace(3) %ptr, align 8
   ret void
 }
 
-define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
+define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
 ; HAWAII-LABEL: local_store_i55:
 ; HAWAII:       ; %bb.0:
 ; HAWAII-NEXT:    s_or_b32 s0, s4, 14
@@ -151,11 +151,11 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
 ; GFX11-NEXT:    ds_store_b16 v1, v2 offset:4
 ; GFX11-NEXT:    ds_store_b32 v1, v3
 ; GFX11-NEXT:    s_endpgm
-  store i55 %arg, i55 addrspace(3)* %ptr, align 8
+  store i55 %arg, ptr addrspace(3) %ptr, align 8
   ret void
 }
 
-define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 {
+define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 {
 ; HAWAII-LABEL: local_store_i48:
 ; HAWAII:       ; %bb.0:
 ; HAWAII-NEXT:    s_load_dword s2, s[4:5], 0x0
@@ -218,11 +218,11 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0
 ; GFX11-NEXT:    ds_store_b16 v0, v1 offset:4
 ; GFX11-NEXT:    ds_store_b32 v0, v2
 ; GFX11-NEXT:    s_endpgm
-  store i48 %arg, i48 addrspace(3)* %ptr, align 8
+  store i48 %arg, ptr addrspace(3) %ptr, align 8
   ret void
 }
 
-define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 {
+define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 {
 ; HAWAII-LABEL: local_store_i65:
 ; HAWAII:       ; %bb.0:
 ; HAWAII-NEXT:    s_load_dword s2, s[4:5], 0x4
@@ -300,11 +300,11 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0
 ; GFX11-NEXT:    ds_store_b8 v2, v3 offset:8
 ; GFX11-NEXT:    ds_store_b64 v2, v[0:1]
 ; GFX11-NEXT:    s_endpgm
-  store i65 %arg, i65 addrspace(3)* %ptr, align 8
+  store i65 %arg, ptr addrspace(3) %ptr, align 8
   ret void
 }
 
-define void @local_store_i13(i13 addrspace(3)* %ptr, i13 %arg) #0 {
+define void @local_store_i13(ptr addrspace(3) %ptr, i13 %arg) #0 {
 ; CIVI-LABEL: local_store_i13:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -339,11 +339,11 @@ define void @local_store_i13(i13 addrspace(3)* %ptr, i13 %arg) #0 {
 ; GFX11-NEXT:    ds_store_b16 v0, v1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i13 %arg, i13 addrspace(3)* %ptr, align 8
+  store i13 %arg, ptr addrspace(3) %ptr, align 8
   ret void
 }
 
-define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 {
+define void @local_store_i17(ptr addrspace(3) %ptr, i17 %arg) #0 {
 ; CIVI-LABEL: local_store_i17:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -382,7 +382,7 @@ define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 {
 ; GFX11-NEXT:    ds_store_b8_d16_hi v0, v2 offset:2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store i17 %arg, i17 addrspace(3)* %ptr, align 8
+  store i17 %arg, ptr addrspace(3) %ptr, align 8
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/structurize.ll b/llvm/test/CodeGen/AMDGPU/structurize.ll
index 3cceb2d45c93e..c7d73c5ee1c71 100644
--- a/llvm/test/CodeGen/AMDGPU/structurize.ll
+++ b/llvm/test/CodeGen/AMDGPU/structurize.ll
@@ -45,7 +45,7 @@
 ; CHECK: CF_END
 
 
-define amdgpu_kernel void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @branch_into_diamond(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
 entry:
 %0 = icmp ne i32 %a, 0
   br i1 %0, label %diamond_head, label %branch_from
@@ -78,6 +78,6 @@ diamond_true:
 
 done:
   %5 = phi i32 [%3, %diamond_false], [%div7, %diamond_true]
-  store i32 %5, i32 addrspace(1)* %out
+  store i32 %5, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/structurize1.ll b/llvm/test/CodeGen/AMDGPU/structurize1.ll
index 2e7d0e615e076..a39c1877cdfa8 100644
--- a/llvm/test/CodeGen/AMDGPU/structurize1.ll
+++ b/llvm/test/CodeGen/AMDGPU/structurize1.ll
@@ -19,7 +19,7 @@
 ; CHECK-LABEL: {{^}}if_inside_loop:
 ; CHECK: LOOP_START_DX10
 ; CHECK: END_LOOP
-define amdgpu_kernel void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @if_inside_loop(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
 entry:
   br label %for.body
 
@@ -57,6 +57,6 @@ for.inc:
   br i1 %7, label %for.body, label %exit
 
 exit:
-  store i32 %val.for.inc, i32 addrspace(1)* %out
+  store i32 %val.for.inc, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/sub.i16.ll b/llvm/test/CodeGen/AMDGPU/sub.i16.ll
index 1d38c520dfeb1..2b1577e832051 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.i16.ll
@@ -7,15 +7,15 @@
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
-  %b = load volatile i16, i16 addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
+  %b = load volatile i16, ptr addrspace(1) %gep.in1
   %add = sub i16 %a, %b
-  store i16 %add, i16 addrspace(1)* %out
+  store i16 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -24,13 +24,13 @@ define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xff85, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define amdgpu_kernel void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
   %add = sub i16 %a, 123
-  store i16 %add, i16 addrspace(1)* %out
+  store i16 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -39,13 +39,13 @@ define amdgpu_kernel void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 a
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x34d, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define amdgpu_kernel void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
   %add = sub i16 %a, -845
-  store i16 %add, i16 addrspace(1)* %out
+  store i16 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -54,13 +54,13 @@ define amdgpu_kernel void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], 63, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_inline_63(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
   %add = sub i16 %a, 63
-  store i16 %add, i16 addrspace(1)* %out
+  store i16 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -70,16 +70,16 @@ define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI-NEXT: buffer_store_dword [[ADD]]
-define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
-  %b = load volatile i16, i16 addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
+  %b = load volatile i16, ptr addrspace(1) %gep.in1
   %add = sub i16 %a, %b
   %ext = zext i16 %add to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -90,16 +90,16 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1
 ; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
 ; VI: buffer_store_dwordx2 v[[[ADD]]:[[VZERO]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
-  %b = load volatile i16, i16 addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds i64, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
+  %b = load volatile i16, ptr addrspace(1) %gep.in1
   %add = sub i16 %a, %b
   %ext = zext i16 %add to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -110,16 +110,16 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i1
 ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
-define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep.in0
-  %b = load i16, i16 addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, ptr addrspace(1) %in1, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep.in0
+  %b = load i16, ptr addrspace(1) %gep.in1
   %add = sub i16 %a, %b
   %ext = sext i16 %add to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -131,16 +131,16 @@ define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i1
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI:      v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep.in0
-  %b = load i16, i16 addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds i64, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, ptr addrspace(1) %in1, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep.in0
+  %b = load i16, ptr addrspace(1) %gep.in1
   %add = sub i16 %a, %b
   %ext = sext i16 %add to i64
-  store i64 %ext, i64 addrspace(1)* %out
+  store i64 %ext, ptr addrspace(1) %out
   ret void
 }
 
@@ -149,16 +149,16 @@ define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i1
 ; GCN-LABEL: {{^}}v_test_sub_i16_constant_commute:
 ; VI: v_subrev_u16_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
 ; CI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 0x800, v{{[0-9]+}}
-define amdgpu_kernel void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_constant_commute(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %size.trunc = trunc i32 %size to i16
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
+  call void asm sideeffect "; $0", "v"(ptr addrspace(3) @lds)
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile i16, ptr addrspace(1) %gep.in0
   %add = sub i16 %a, %size.trunc
-  store i16 %add, i16 addrspace(1)* %out
+  store i16 %add, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 9d15c2357b9f3..67369b0f81875 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -7,40 +7,40 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable
 ; GCN-LABEL: {{^}}s_sub_i32:
 ; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}]
 ; GCN: s_sub_i32 s{{[0-9]+}}, s[[#LOAD + 2]], s[[#LOAD + 3]]
-define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
   %result = sub i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_sub_imm_i32:
 ; GCN: s_load_dword [[A:s[0-9]+]]
 ; GCN: s_sub_i32 s{{[0-9]+}}, 0x4d2, [[A]]
-define amdgpu_kernel void @s_sub_imm_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
   %result = sub i32 1234, %a
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_sub_i32:
 ; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1)* %in
-  %b = load i32, i32 addrspace(1)* %b_ptr
+define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
   %result = sub i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_sub_imm_i32:
 ; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x7b, v{{[0-9]+}}
 ; GFX9: v_sub_u32_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
-define amdgpu_kernel void @test_sub_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %a = load i32, i32 addrspace(1)* %in
+define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %a = load i32, ptr addrspace(1) %in
   %result = sub i32 123, %a
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -50,12 +50,12 @@ define amdgpu_kernel void @test_sub_imm_i32(i32 addrspace(1)* %out, i32 addrspac
 
 ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
+define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <2 x i32>, ptr addrspace(1) %in
+  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
   %result = sub <2 x i32> %a, %b
-  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -69,26 +69,26 @@ define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32
 ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <4 x i32>, ptr addrspace(1) %in
+  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
   %result = sub <4 x i32> %a, %b
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_sub_i16:
 ; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
 ; GFX89: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
-  %b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i32 1
-  %a = load volatile i16, i16 addrspace(1)* %gep
-  %b = load volatile i16, i16 addrspace(1)* %b_ptr
+  %gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
+  %b_ptr = getelementptr i16, ptr addrspace(1) %gep, i32 1
+  %a = load volatile i16, ptr addrspace(1) %gep
+  %b = load volatile i16, ptr addrspace(1) %b_ptr
   %result = sub i16 %a, %b
-  store i16 %result, i16 addrspace(1)* %out
+  store i16 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -97,14 +97,14 @@ define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)
 ; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 ; GFX9: v_pk_sub_i16
-define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
-  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
+  %gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid
+  %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %gep, i16 1
+  %a = load <2 x i16>, ptr addrspace(1) %gep
+  %b = load <2 x i16>, ptr addrspace(1) %b_ptr
   %result = sub <2 x i16> %a, %b
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -116,23 +116,23 @@ define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 
 ; GFX9: v_pk_sub_i16
 ; GFX9: v_pk_sub_i16
-define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
-  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1
-  %a = load <4 x i16>, <4 x i16> addrspace(1) * %gep
-  %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
+  %gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
+  %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %gep, i16 1
+  %a = load <4 x i16>, ptr addrspace(1) %gep
+  %b = load <4 x i16>, ptr addrspace(1) %b_ptr
   %result = sub <4 x i16> %a, %b
-  store <4 x i16> %result, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %result, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_sub_i64:
 ; GCN: s_sub_u32
 ; GCN: s_subb_u32
-define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind {
   %result = sub i64 %a, %b
-  store i64 %result, i64 addrspace(1)* %out, align 8
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -145,14 +145,14 @@ define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64
 
 ; GFX9: v_sub_co_u32_e32
 ; GFX9: v_subb_co_u32_e32
-define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
+define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
-  %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
-  %a = load i64, i64 addrspace(1)* %a_ptr
-  %b = load i64, i64 addrspace(1)* %b_ptr
+  %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid
+  %b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid
+  %a = load i64, ptr addrspace(1) %a_ptr
+  %b = load i64, ptr addrspace(1) %b_ptr
   %result = sub i64 %a, %b
-  store i64 %result, i64 addrspace(1)* %out, align 8
+  store i64 %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -171,14 +171,14 @@ define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspa
 ; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
 ; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
 ; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
-define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
-  %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
-  %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
-  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
+  %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid
+  %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid
+  %a = load <2 x i64>, ptr addrspace(1) %a_ptr
+  %b = load <2 x i64>, ptr addrspace(1) %b_ptr
   %result = sub <2 x i64> %a, %b
-  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -209,14 +209,14 @@ define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 ; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
 ; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
 ; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
-define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
-  %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid
-  %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr
-  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
+  %a_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inA, i32 %tid
+  %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inB, i32 %tid
+  %a = load <4 x i64>, ptr addrspace(1) %a_ptr
+  %b = load <4 x i64>, ptr addrspace(1) %b_ptr
   %result = sub <4 x i64> %a, %b
-  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -234,7 +234,7 @@ define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
 define amdgpu_ps void @sub_select_vop3(i32 inreg %s, i32 %v) {
   %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
   %sub = sub i32 %v, %s
-  store i32 %sub, i32 addrspace(3)* undef
+  store i32 %sub, ptr addrspace(3) undef
   call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 58efbdfb40fb3..8ca335097abdd 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -82,17 +82,17 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
-  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
+  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
   %add = sub <2 x i16> %a, %b
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
+define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
 ; GFX9-LABEL: s_test_sub_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
@@ -163,14 +163,14 @@ define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
-  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
+  %a = load <2 x i16>, ptr addrspace(4) %in0
+  %b = load <2 x i16>, ptr addrspace(4) %in1
   %add = sub <2 x i16> %a, %b
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
+define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
 ; GCN-LABEL: s_test_sub_self_v2i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -201,14 +201,14 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
+  %a = load <2 x i16>, ptr addrspace(4) %in0
   %add = sub <2 x i16> %a, %a
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: VI should not scalarize arg access.
-define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
 ; GFX9-LABEL: s_test_sub_v2i16_kernarg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -266,11 +266,11 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %add = sub <2 x i16> %a, %b
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_constant:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -332,16 +332,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
   %add = sub <2 x i16> %a, <i16 123, i16 456>
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -403,15 +403,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
   %add = sub <2 x i16> %a, <i16 -845, i16 -991>
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -472,15 +472,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)*
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
   %add = sub <2 x i16> %a, <i16 -1, i16 -1>
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -540,16 +540,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
   %add = sub <2 x i16> %a, <i16 32, i16 0>
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
 ; The high element gives fp
-define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -610,16 +610,16 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
   %add = sub <2 x i16> %a, <i16 0, i16 16256>
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %add, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -702,19 +702,19 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
-  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
+  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
   %add = sub <2 x i16> %a, %b
   %ext = zext <2 x i16> %add to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -804,19 +804,19 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
-  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
+  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
   %add = sub <2 x i16> %a, %b
   %ext = zext <2 x i16> %add to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -901,19 +901,19 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
-  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
-  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
+  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
+  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
   %add = sub <2 x i16> %a, %b
   %ext = sext <2 x i16> %add to <2 x i32>
-  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %ext, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
 ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -1008,14 +1008,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
-  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
-  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
-  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
+  %a = load <2 x i16>, ptr addrspace(1) %gep.in0
+  %b = load <2 x i16>, ptr addrspace(1) %gep.in1
   %add = sub <2 x i16> %a, %b
   %ext = sext <2 x i16> %add to <2 x i64>
-  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %ext, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/sub_i1.ll b/llvm/test/CodeGen/AMDGPU/sub_i1.ll
index 886f93d782d1b..1c056c5a4faca 100644
--- a/llvm/test/CodeGen/AMDGPU/sub_i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub_i1.ll
@@ -6,21 +6,21 @@
 ; GCN-LABEL: {{^}}sub_var_var_i1:
 ; WAVE32: s_xor_b32
 ; WAVE64: s_xor_b64
-define amdgpu_kernel void @sub_var_var_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
-  %a = load volatile i1, i1 addrspace(1)* %in0
-  %b = load volatile i1, i1 addrspace(1)* %in1
+define amdgpu_kernel void @sub_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load volatile i1, ptr addrspace(1) %in0
+  %b = load volatile i1, ptr addrspace(1) %in1
   %sub = sub i1 %a, %b
-  store i1 %sub, i1 addrspace(1)* %out
+  store i1 %sub, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}sub_var_imm_i1:
 ; WAVE32: s_not_b32
 ; WAVE64: s_not_b64
-define amdgpu_kernel void @sub_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) {
-  %a = load volatile i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @sub_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %a = load volatile i1, ptr addrspace(1) %in
   %sub = sub i1 %a, 1
-  store i1 %sub, i1 addrspace(1)* %out
+  store i1 %sub, ptr addrspace(1) %out
   ret void
 }
 
@@ -28,24 +28,24 @@ define amdgpu_kernel void @sub_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)
 ; GCN: ; %endif
 ; WAVE32: s_not_b32
 ; WAVE64: s_not_b64
-define amdgpu_kernel void @sub_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
+define amdgpu_kernel void @sub_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %d_cmp = icmp ult i32 %tid, 16
   br i1 %d_cmp, label %if, label %else
 
 if:
-  %0 = load volatile i1, i1 addrspace(1)* %a
+  %0 = load volatile i1, ptr addrspace(1) %a
   br label %endif
 
 else:
-  %1 = load volatile i1, i1 addrspace(1)* %b
+  %1 = load volatile i1, ptr addrspace(1) %b
   br label %endif
 
 endif:
   %2 = phi i1 [%0, %if], [%1, %else]
   %3 = sub i1 %2, -1
-  store i1 %3, i1 addrspace(1)* %out
+  store i1 %3, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index 62eea86ed8f75..9cf4277237935 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -4,7 +4,7 @@
 ; register coalescer because it is hidden with subregister insert/extract.
 target triple="amdgcn--"
 
-define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) nounwind {
 ; CHECK-LABEL: foobar:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -49,7 +49,7 @@ ift:
 ife:
   %val = phi <4 x float> [ %v1, %ift ], [ %v0, %entry ]
   %v2 = extractelement <4 x float> %val, i32 1
-  store float %v2, float addrspace(1)* %out, align 4
+  store float %v2, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll b/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
index 57c267e54a146..f79ca18e7672d 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
@@ -12,7 +12,7 @@ define amdgpu_kernel void @foobar() {
   %v5 = icmp ne i32 %v4, 0
   %v6 = select i1 %v5, i32 undef, i32 0
   %v15 = insertelement <2 x i32> undef, i32 %v6, i32 1
-  store <2 x i32> %v15, <2 x i32> addrspace(1)* undef, align 8
+  store <2 x i32> %v15, ptr addrspace(1) undef, align 8
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/switch-default-block-unreachable.ll b/llvm/test/CodeGen/AMDGPU/switch-default-block-unreachable.ll
index 1a0d90c30d238..ad22c1b06cf49 100644
--- a/llvm/test/CodeGen/AMDGPU/switch-default-block-unreachable.ll
+++ b/llvm/test/CodeGen/AMDGPU/switch-default-block-unreachable.ll
@@ -48,7 +48,7 @@ define void @test() #1 {
 
   unreach.blk:                                      ; preds = %preheader.blk, %pre.false.blk
     %phi.val = phi i32 [ %call.pre.false, %pre.false.blk ], [ undef, %preheader.blk ]
-    store i32 %phi.val, i32* undef
+    store i32 %phi.val, ptr undef
     unreachable
 
   exit:                                             ; preds = %switch.blk

diff --git a/llvm/test/CodeGen/AMDGPU/switch-unreachable.ll b/llvm/test/CodeGen/AMDGPU/switch-unreachable.ll
index 11d71f7fe2efa..68d6fb5b851ed 100644
--- a/llvm/test/CodeGen/AMDGPU/switch-unreachable.ll
+++ b/llvm/test/CodeGen/AMDGPU/switch-unreachable.ll
@@ -8,7 +8,7 @@
 ; CHECK-LABEL: {{^}}switch_unreachable:
 ; CHECK-NOT: s_endpgm
 ; CHECK: .Lfunc_end
-define amdgpu_kernel void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+define amdgpu_kernel void @switch_unreachable(ptr addrspace(1) %g, ptr addrspace(3) %l, i32 %x) nounwind {
 centry:
   switch i32 %x, label %sw.default [
     i32 0, label %sw.bb

diff --git a/llvm/test/CodeGen/AMDGPU/syncscopes.ll b/llvm/test/CodeGen/AMDGPU/syncscopes.ll
index c960f59456b8c..1b7a8122910e3 100644
--- a/llvm/test/CodeGen/AMDGPU/syncscopes.ll
+++ b/llvm/test/CodeGen/AMDGPU/syncscopes.ll
@@ -6,14 +6,14 @@
 ; GCN: FLAT_STORE_DWORD killed renamable $vgpr7_vgpr8, killed renamable $vgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("wavefront") seq_cst (s32) into %ir.wavefront_out)
 define void @syncscopes(
     i32 %agent,
-    i32* %agent_out,
+    ptr %agent_out,
     i32 %workgroup,
-    i32* %workgroup_out,
+    ptr %workgroup_out,
     i32 %wavefront,
-    i32* %wavefront_out) {
+    ptr %wavefront_out) {
 entry:
-  store atomic i32 %agent, i32* %agent_out syncscope("agent") seq_cst, align 4
-  store atomic i32 %workgroup, i32* %workgroup_out syncscope("workgroup") seq_cst, align 4
-  store atomic i32 %wavefront, i32* %wavefront_out syncscope("wavefront") seq_cst, align 4
+  store atomic i32 %agent, ptr %agent_out syncscope("agent") seq_cst, align 4
+  store atomic i32 %workgroup, ptr %workgroup_out syncscope("workgroup") seq_cst, align 4
+  store atomic i32 %wavefront, ptr %wavefront_out syncscope("wavefront") seq_cst, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/target-cpu.ll b/llvm/test/CodeGen/AMDGPU/target-cpu.ll
index 9a56e85decebf..14b48fc278fd7 100644
--- a/llvm/test/CodeGen/AMDGPU/target-cpu.ll
+++ b/llvm/test/CodeGen/AMDGPU/target-cpu.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -disable-promote-alloca-to-vector -verify-machineinstrs < %s | FileCheck %s
 
-declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #1
+declare ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #1
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
@@ -15,14 +15,13 @@ declare void @llvm.amdgcn.s.dcache.wb() #0
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
 define amdgpu_kernel void @target_none() #0 {
-  %kernargs = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-  %kernargs.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernargs, i64 1024
-  %kernargs.gep.cast = bitcast i8 addrspace(4)* %kernargs.gep to i32 addrspace(1)* addrspace(4)*
-  %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %kernargs.gep.cast
+  %kernargs = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+  %kernargs.gep = getelementptr inbounds i8, ptr addrspace(4) %kernargs, i64 1024
+  %ptr = load ptr addrspace(1), ptr addrspace(4) %kernargs.gep
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
-  store i32 0, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %ptr, i64 %id.ext
+  store i32 0, ptr addrspace(1) %gep
   ret void
 }
 
@@ -31,14 +30,13 @@ define amdgpu_kernel void @target_none() #0 {
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
 define amdgpu_kernel void @target_tahiti() #1 {
-  %kernargs = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-  %kernargs.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernargs, i64 1024
-  %kernargs.gep.cast = bitcast i8 addrspace(4)* %kernargs.gep to i32 addrspace(1)* addrspace(4)*
-  %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %kernargs.gep.cast
+  %kernargs = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+  %kernargs.gep = getelementptr inbounds i8, ptr addrspace(4) %kernargs, i64 1024
+  %ptr = load ptr addrspace(1), ptr addrspace(4) %kernargs.gep
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
-  store i32 0, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %ptr, i64 %id.ext
+  store i32 0, ptr addrspace(1) %gep
   ret void
 }
 
@@ -47,14 +45,13 @@ define amdgpu_kernel void @target_tahiti() #1 {
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
 ; CHECK: s_dcache_inv_vol
 define amdgpu_kernel void @target_bonaire() #3 {
-  %kernargs = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-  %kernargs.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernargs, i64 1024
-  %kernargs.gep.cast = bitcast i8 addrspace(4)* %kernargs.gep to i32 addrspace(1)* addrspace(4)*
-  %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %kernargs.gep.cast
+  %kernargs = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+  %kernargs.gep = getelementptr inbounds i8, ptr addrspace(4) %kernargs, i64 1024
+  %ptr = load ptr addrspace(1), ptr addrspace(4) %kernargs.gep
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
-  store i32 0, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %ptr, i64 %id.ext
+  store i32 0, ptr addrspace(1) %gep
   call void @llvm.amdgcn.s.dcache.inv.vol()
   ret void
 }
@@ -64,27 +61,26 @@ define amdgpu_kernel void @target_bonaire() #3 {
 ; CHECK: flat_store_dword
 ; CHECK: s_dcache_wb{{$}}
 define amdgpu_kernel void @target_fiji() #4 {
-  %kernargs = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-  %kernargs.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernargs, i64 1024
-  %kernargs.gep.cast = bitcast i8 addrspace(4)* %kernargs.gep to i32 addrspace(1)* addrspace(4)*
-  %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %kernargs.gep.cast
+  %kernargs = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+  %kernargs.gep = getelementptr inbounds i8, ptr addrspace(4) %kernargs, i64 1024
+  %ptr = load ptr addrspace(1), ptr addrspace(4) %kernargs.gep
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
-  store i32 0, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %ptr, i64 %id.ext
+  store i32 0, ptr addrspace(1) %gep
   call void @llvm.amdgcn.s.dcache.wb()
   ret void
 }
 
 ; CHECK-LABEL: {{^}}promote_alloca_enabled:
 ; CHECK: ds_read_b32
-define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
+define amdgpu_kernel void @promote_alloca_enabled(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
-  %tmp = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
-  %load = load i32, i32 addrspace(5)* %arrayidx1
-  store i32 %load, i32 addrspace(1)* %out
+  %tmp = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp
+  %load = load i32, ptr addrspace(5) %arrayidx1
+  store i32 %load, ptr addrspace(1) %out
   ret void
 }
 
@@ -92,13 +88,13 @@ entry:
 ; CHECK: SCRATCH_RSRC_DWORD0
 ; CHECK: SCRATCH_RSRC_DWORD1
 ; CHECK: ScratchSize: 24
-define amdgpu_kernel void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
+define amdgpu_kernel void @promote_alloca_disabled(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
-  %tmp = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
-  %load = load i32, i32 addrspace(5)* %arrayidx1
-  store i32 %load, i32 addrspace(1)* %out
+  %tmp = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp
+  %load = load i32, ptr addrspace(5) %arrayidx1
+  store i32 %load, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
index 1c92f74ec4bdf..08cca63f80ae3 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -4,33 +4,33 @@
 ; GCN-LABEL: {{^}}trunc_i64_bitcast_v2i32:
 ; GCN: buffer_load_dword v
 ; GCN: buffer_store_dword v
-define amdgpu_kernel void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+define amdgpu_kernel void @trunc_i64_bitcast_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %ld = load <2 x i32>, ptr addrspace(1) %in
   %bc = bitcast <2 x i32> %ld to i64
   %trunc = trunc i64 %bc to i32
-  store i32 %trunc, i32 addrspace(1)* %out
+  store i32 %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}trunc_i96_bitcast_v3i32:
 ; GCN: buffer_load_dword v
 ; GCN: buffer_store_dword v
-define amdgpu_kernel void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
-  %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
+define amdgpu_kernel void @trunc_i96_bitcast_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %ld = load <3 x i32>, ptr addrspace(1) %in
   %bc = bitcast <3 x i32> %ld to i96
   %trunc = trunc i96 %bc to i32
-  store i32 %trunc, i32 addrspace(1)* %out
+  store i32 %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}trunc_i128_bitcast_v4i32:
 ; GCN: buffer_load_dword v
 ; GCN: buffer_store_dword v
-define amdgpu_kernel void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+define amdgpu_kernel void @trunc_i128_bitcast_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %ld = load <4 x i32>, ptr addrspace(1) %in
   %bc = bitcast <4 x i32> %ld to i128
   %trunc = trunc i128 %bc to i32
-  store i32 %trunc, i32 addrspace(1)* %out
+  store i32 %trunc, ptr addrspace(1) %out
   ret void
 }
 
@@ -38,11 +38,11 @@ define amdgpu_kernel void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x
 ; GCN-LABEL: {{^}}trunc_i16_bitcast_v2i16:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: buffer_store_short [[VAL]]
-define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
-  %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
+define amdgpu_kernel void @trunc_i16_bitcast_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %ld = load <2 x i16>, ptr addrspace(1) %in
   %bc = bitcast <2 x i16> %ld to i32
   %trunc = trunc i32 %bc to i16
-  store i16 %trunc, i16 addrspace(1)* %out
+  store i16 %trunc, ptr addrspace(1) %out
   ret void
 }
 
@@ -53,11 +53,11 @@ define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x
 ;      t30: i16 = truncate t23
 ; GCN: buffer_load_dword v[[VAL:[0-9]+]]
 ; GCN: buffer_store_short v[[VAL]], off
-define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
-  %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
+define amdgpu_kernel void @trunc_i16_bitcast_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %ld = load <4 x i16>, ptr addrspace(1) %in
   %bc = bitcast <4 x i16> %ld to i64
   %trunc = trunc i64 %bc to i16
-  store i16 %trunc, i16 addrspace(1)* %out
+  store i16 %trunc, ptr addrspace(1) %out
   ret void
 }
 
@@ -66,32 +66,32 @@ define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x
 ; SI: buffer_load_ubyte [[VAL:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; GCN: buffer_store_byte [[VAL]]
-define amdgpu_kernel void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
-  %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
+define amdgpu_kernel void @trunc_i8_bitcast_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %ld = load <2 x i8>, ptr addrspace(1) %in
   %bc = bitcast <2 x i8> %ld to i16
   %trunc = trunc i16 %bc to i8
-  store i8 %trunc, i8 addrspace(1)* %out
+  store i8 %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}trunc_i32_bitcast_v4i8:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: buffer_store_byte [[VAL]]
-define amdgpu_kernel void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
-  %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
+define amdgpu_kernel void @trunc_i32_bitcast_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %ld = load <4 x i8>, ptr addrspace(1) %in
   %bc = bitcast <4 x i8> %ld to i32
   %trunc = trunc i32 %bc to i8
-  store i8 %trunc, i8 addrspace(1)* %out
+  store i8 %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}trunc_i24_bitcast_v3i8:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: buffer_store_byte [[VAL]]
-define amdgpu_kernel void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
-  %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+define amdgpu_kernel void @trunc_i24_bitcast_v3i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %ld = load <3 x i8>, ptr addrspace(1) %in
   %bc = bitcast <3 x i8> %ld to i24
   %trunc = trunc i24 %bc to i8
-  store i8 %trunc, i8 addrspace(1)* %out
+  store i8 %trunc, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
index 9b9a807db2302..5631510b01a48 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
@@ -9,11 +9,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; SI: v_cmp_eq_u32_e32 vcc, 0, [[TMP]]{{$}}
 ; SI: v_cndmask_b32_e64
 ; SI: buffer_store_byte
-define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_0(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i1, ptr addrspace(1) %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, 0
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }
 
@@ -25,22 +25,22 @@ define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %o
 ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
 ; SI: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_0(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i1, ptr addrspace(1) %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, 0
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i1, ptr addrspace(1) %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, 1
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }
 
@@ -48,11 +48,11 @@ define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %o
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i1, ptr addrspace(1) %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, 1
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }
 
@@ -60,22 +60,22 @@ define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %o
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i1, ptr addrspace(1) %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, -1
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i1, ptr addrspace(1) %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, -1
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }
 
@@ -84,11 +84,11 @@ define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)*
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_0(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i1, ptr addrspace(1) %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, 0
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }
 
@@ -96,22 +96,22 @@ define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %o
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_0(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i1, ptr addrspace(1) %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, 0
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i1, ptr addrspace(1) %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, 1
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }
 
@@ -122,11 +122,11 @@ define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %o
 ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
 ; SI: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i1, ptr addrspace(1) %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, 1
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }
 
@@ -137,22 +137,22 @@ define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %o
 ; XSI: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}}
 ; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]]
 ; XSI-NEXT: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i1, ptr addrspace(1) %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, -1
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
-  %load = load i1, i1 addrspace(1)* %in
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+  %load = load i1, ptr addrspace(1) %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, -1
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }
 
@@ -162,13 +162,13 @@ define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)*
 ; SI: v_cmp_ne_u32_e32 vcc, -1, [[LOAD]]{{$}}
 ; SI-NEXT: v_cndmask_b32_e64
 ; SI: {{buffer|flat}}_store_byte
-define amdgpu_kernel void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.ptr = getelementptr i8, i8 addrspace(1)* %in, i32 %tid.x
-  %load = load i8, i8 addrspace(1)* %in.ptr
+  %in.ptr = getelementptr i8, ptr addrspace(1) %in, i32 %tid.x
+  %load = load i8, ptr addrspace(1) %in.ptr
   %masked = and i8 %load, 255
   %ext = sext i8 %masked to i32
   %cmp = icmp ne i32 %ext, -1
-  store i1 %cmp, i1 addrspace(1)* %out
+  store i1 %cmp, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index eb2549ac60952..afec8f3512650 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -43,8 +43,8 @@ define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) {
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v0, 4, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
-  %load0 = load i32, i32 addrspace(1)* undef
-  %load1 = load i32, i32 addrspace(1)* null
+  %load0 = load i32, ptr addrspace(1) undef
+  %load1 = load i32, ptr addrspace(1) null
   %insert.0 = insertelement <2 x i32> undef, i32 %load0, i32 0
   %insert.1 = insertelement <2 x i32> %insert.0, i32 99, i32 1
   %bc = bitcast <2 x i32> %insert.1 to i64
@@ -72,8 +72,8 @@ define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) {
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v0, 4, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
-  %load0 = load float, float addrspace(1)* undef
-  %load1 = load float, float addrspace(1)* null
+  %load0 = load float, ptr addrspace(1) undef
+  %load1 = load float, ptr addrspace(1) null
   %insert.0 = insertelement <2 x float> undef, float %load0, i32 0
   %insert.1 = insertelement <2 x float> %insert.0, float 4.0, i32 1
   %bc = bitcast <2 x float> %insert.1 to i64
@@ -82,7 +82,7 @@ define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) {
   ret i16 %add
 }
 
-define amdgpu_kernel void @truncate_high_elt_extract_vector(<2 x i16> addrspace(1)* nocapture readonly %arg, <2 x i16> addrspace(1)* nocapture readonly %arg1, <2 x i16> addrspace(1)* nocapture %arg2) local_unnamed_addr {
+define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture readonly %arg1, ptr addrspace(1) nocapture %arg2) local_unnamed_addr {
 ; SI-LABEL: truncate_high_elt_extract_vector:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -119,10 +119,10 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(<2 x i16> addrspace(
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 bb:
-  %tmp = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %arg, i64 undef
-  %tmp3 = load <2 x i16>, <2 x i16> addrspace(1)* %tmp, align 4
-  %tmp4 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %arg1, i64 undef
-  %tmp5 = load <2 x i16>, <2 x i16> addrspace(1)* %tmp4, align 4
+  %tmp = getelementptr inbounds <2 x i16>, ptr addrspace(1) %arg, i64 undef
+  %tmp3 = load <2 x i16>, ptr addrspace(1) %tmp, align 4
+  %tmp4 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %arg1, i64 undef
+  %tmp5 = load <2 x i16>, ptr addrspace(1) %tmp4, align 4
   %tmp6 = sext <2 x i16> %tmp3 to <2 x i32>
   %tmp7 = sext <2 x i16> %tmp5 to <2 x i32>
   %tmp8 = extractelement <2 x i32> %tmp6, i64 0
@@ -132,8 +132,8 @@ bb:
   %tmp12 = insertelement <2 x i32> %tmp11, i32 undef, i32 1
   %tmp13 = lshr <2 x i32> %tmp12, <i32 16, i32 16>
   %tmp14 = trunc <2 x i32> %tmp13 to <2 x i16>
-  %tmp15 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %arg2, i64 undef
-  store <2 x i16> %tmp14, <2 x i16> addrspace(1)* %tmp15, align 4
+  %tmp15 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %arg2, i64 undef
+  store <2 x i16> %tmp14, ptr addrspace(1) %tmp15, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
index d67b8f981b281..a5bf3b832ec30 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
@@ -2,54 +2,54 @@
 
 ; GCN-LABEL: {{^}}global_truncstore_f64_to_f16:
 ; GCN: s_endpgm
-define amdgpu_kernel void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 {
-  %val = load double, double addrspace(1)* %in
+define amdgpu_kernel void @global_truncstore_f64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %val = load double, ptr addrspace(1) %in
   %cvt = fptrunc double %val to half
-  store half %cvt, half addrspace(1)* %out
+  store half %cvt, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16:
 ; GCN: s_endpgm
-define amdgpu_kernel void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
-  %val = load <2 x double>, <2 x double> addrspace(1)* %in
+define amdgpu_kernel void @global_truncstore_v2f64_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %val = load <2 x double>, ptr addrspace(1) %in
   %cvt = fptrunc <2 x double> %val to <2 x half>
-  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
+  store <2 x half> %cvt, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16:
 ; GCN: s_endpgm
-define amdgpu_kernel void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
-  %val = load <3 x double>, <3 x double> addrspace(1)* %in
+define amdgpu_kernel void @global_truncstore_v3f64_to_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %val = load <3 x double>, ptr addrspace(1) %in
   %cvt = fptrunc <3 x double> %val to <3 x half>
-  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
+  store <3 x half> %cvt, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16:
 ; GCN: s_endpgm
-define amdgpu_kernel void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
-  %val = load <4 x double>, <4 x double> addrspace(1)* %in
+define amdgpu_kernel void @global_truncstore_v4f64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %val = load <4 x double>, ptr addrspace(1) %in
   %cvt = fptrunc <4 x double> %val to <4 x half>
-  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
+  store <4 x half> %cvt, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16:
 ; GCN: s_endpgm
-define amdgpu_kernel void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
-  %val = load <8 x double>, <8 x double> addrspace(1)* %in
+define amdgpu_kernel void @global_truncstore_v8f64_to_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %val = load <8 x double>, ptr addrspace(1) %in
   %cvt = fptrunc <8 x double> %val to <8 x half>
-  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
+  store <8 x half> %cvt, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16:
 ; GCN: s_endpgm
-define amdgpu_kernel void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
-  %val = load <16 x double>, <16 x double> addrspace(1)* %in
+define amdgpu_kernel void @global_truncstore_v16f64_to_v16f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %val = load <16 x double>, ptr addrspace(1) %in
   %cvt = fptrunc <16 x double> %val to <16 x half>
-  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
+  store <16 x half> %cvt, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
index 8027c89ef3743..54d65d9f6d774 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll
@@ -7,17 +7,17 @@
 ; GCN: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
 ; GCN: buffer_store_byte [[VREG]],
-define amdgpu_kernel void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
+define amdgpu_kernel void @global_truncstore_i32_to_i1(ptr addrspace(1) %out, i32 %val) nounwind {
   %trunc = trunc i32 %val to i1
-  store i1 %trunc, i1 addrspace(1)* %out, align 1
+  store i1 %trunc, ptr addrspace(1) %out, align 1
   ret void
 }
 
 ; GCN-LABEL: {{^}}global_truncstore_i64_to_i1:
 ; GCN: buffer_store_byte
-define amdgpu_kernel void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
+define amdgpu_kernel void @global_truncstore_i64_to_i1(ptr addrspace(1) %out, i64 %val) nounwind {
   %trunc = trunc i64 %val to i1
-  store i1 %trunc, i1 addrspace(1)* %out, align 1
+  store i1 %trunc, ptr addrspace(1) %out, align 1
   ret void
 }
 
@@ -27,15 +27,15 @@ define amdgpu_kernel void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i6
 ; GCN: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
 ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
 ; GCN: buffer_store_byte [[VREG]],
-define amdgpu_kernel void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
+define amdgpu_kernel void @s_arg_global_truncstore_i16_to_i1(ptr addrspace(1) %out, i16 %val) nounwind {
   %trunc = trunc i16 %val to i1
-  store i1 %trunc, i1 addrspace(1)* %out, align 1
+  store i1 %trunc, ptr addrspace(1) %out, align 1
   ret void
 }
 ; GCN-LABEL: {{^}}global_truncstore_i16_to_i1:
-define amdgpu_kernel void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
+define amdgpu_kernel void @global_truncstore_i16_to_i1(ptr addrspace(1) %out, i16 %val0, i16 %val1) nounwind {
   %add = add i16 %val0, %val1
   %trunc = trunc i16 %add to i1
-  store i1 %trunc, i1 addrspace(1)* %out, align 1
+  store i1 %trunc, ptr addrspace(1) %out, align 1
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll
index b1f0d3235a24a..cdce9d5966d4d 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll
@@ -2,59 +2,59 @@
 
 ; GCN-LABEL: {{^}}short_char:
 ; GCN: global_store_byte v
-define protected amdgpu_kernel void @short_char(i8 addrspace(1)* %out) {
+define protected amdgpu_kernel void @short_char(ptr addrspace(1) %out) {
 entry:
-  %tmp = load i16, i16 addrspace(1)* undef
+  %tmp = load i16, ptr addrspace(1) undef
   %tmp1 = trunc i16 %tmp to i8
-  store i8 %tmp1, i8 addrspace(1)* %out
+  store i8 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}short2_char4:
 ; GCN: global_store_dword v
-define protected amdgpu_kernel void @short2_char4(<4 x i8> addrspace(1)* %out) {
+define protected amdgpu_kernel void @short2_char4(ptr addrspace(1) %out) {
 entry:
-  %tmp = load <2 x i16>, <2 x i16> addrspace(1)* undef, align 4
+  %tmp = load <2 x i16>, ptr addrspace(1) undef, align 4
   %vecinit = shufflevector <2 x i16> %tmp, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %vecinit2 = shufflevector <4 x i16> %vecinit, <4 x i16> <i16 undef, i16 undef, i16 0, i16 0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
   %tmp1 = trunc <4 x i16> %vecinit2 to <4 x i8>
-  store <4 x i8> %tmp1, <4 x i8> addrspace(1)* %out, align 4
+  store <4 x i8> %tmp1, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}short4_char8:
 ; GCN: global_store_dwordx2 v
-define protected amdgpu_kernel void @short4_char8(<8 x i8> addrspace(1)* %out) {
+define protected amdgpu_kernel void @short4_char8(ptr addrspace(1) %out) {
 entry:
-  %tmp = load <4 x i16>, <4 x i16> addrspace(1)* undef, align 8
+  %tmp = load <4 x i16>, ptr addrspace(1) undef, align 8
   %vecinit = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   %vecinit2 = shufflevector <8 x i16> %vecinit, <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 0, i16 0, i16 0>, <8 x i32> <i32 0, i32 1, i32 6, i32 7, i32 0, i32 1, i32 6, i32 7>
   %tmp1 = trunc <8 x i16> %vecinit2 to <8 x i8>
-  store <8 x i8> %tmp1, <8 x i8> addrspace(1)* %out, align 8
+  store <8 x i8> %tmp1, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}short8_char16:
 ; GCN: global_store_dwordx4 v
-define protected amdgpu_kernel void @short8_char16(<16 x i8> addrspace(1)* %out) {
+define protected amdgpu_kernel void @short8_char16(ptr addrspace(1) %out) {
 entry:
-  %tmp = load <8 x i16>, <8 x i16> addrspace(1)* undef, align 16
+  %tmp = load <8 x i16>, ptr addrspace(1) undef, align 16
   %vecinit = shufflevector <8 x i16> %tmp, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %vecinit2 = shufflevector <16 x i16> %vecinit, <16 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <16 x i32> <i32 0, i32 1, i32 6, i32 7, i32 0, i32 1, i32 6, i32 7, i32 0, i32 1, i32 6, i32 7, i32 0, i32 1, i32 6, i32 7>
   %tmp1 = trunc <16 x i16> %vecinit2 to <16 x i8>
-  store <16 x i8> %tmp1, <16 x i8> addrspace(1)* %out, align 16
+  store <16 x i8> %tmp1, ptr addrspace(1) %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}short16_char32:
 ; GCN: global_store_dwordx4 v
 ; GCN: global_store_dwordx4 v
-define protected amdgpu_kernel void @short16_char32(<32 x i8> addrspace(1)* %out) {
+define protected amdgpu_kernel void @short16_char32(ptr addrspace(1) %out) {
 entry:
-  %tmp = load <16 x i16>, <16 x i16> addrspace(1)* undef, align 32
+  %tmp = load <16 x i16>, ptr addrspace(1) undef, align 32
   %vecinit = shufflevector <16 x i16> %tmp, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %vecinit2 = shufflevector <32 x i16> %vecinit, <32 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 1, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 undef, i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 1, i32 6, i32 7, i32 0, i32 1, i32 6, i32 7, i32 0, i32 1, i32 6, i32 7, i32 0, i32 1, i32 6, i32 7, i32 0, i32 1, i32 6, i32 7, i32 0, i32 1, i32 6, i32 7, i32 0, i32 1, i32 6, i32 7, i32 0, i32 1, i32 6, i32 7>
   %tmp1 = trunc <32 x i16> %vecinit2 to <32 x i8>
-  store <32 x i8> %tmp1, <32 x i8> addrspace(1)* %out, align 32
+  store <32 x i8> %tmp1, ptr addrspace(1) %out, align 32
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll
index f45de679588f5..08e9bc87ed207 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll
@@ -3,16 +3,16 @@
 
 ; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8:
 ; SI: buffer_store_dwordx4
-define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
+define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out, <16 x i32> %in) {
   %trunc = trunc <16 x i32> %in to <16 x i8>
-  store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
+  store <16 x i8> %trunc, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8:
 ; SI: buffer_store_dwordx4
-define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) {
+define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out, <16 x i64> %in) {
   %trunc = trunc <16 x i64> %in to <16 x i8>
-  store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
+  store <16 x i8> %trunc, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll b/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
index 3911909e345bc..da68a589a96c6 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
@@ -6,13 +6,13 @@
 
 ; CHECK-LABEL: {{^}}test:
 ; CHECK: MEM_RAT_CACHELESS STORE_RAW
-define amdgpu_kernel void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %cond, <4 x i8> %in) {
 entry:
   %0 = icmp eq i32 %cond, 0
   br i1 %0, label %if, label %done
 
 if:
-  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
+  store <4 x i8> %in, ptr addrspace(1) %out
   br label %done
 
 done:

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index f5401a1a254db..918193faf86b5 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -4,7 +4,7 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
-define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, [8 x i32], i64 %in) {
+define amdgpu_kernel void @trunc_i64_to_i32_store(ptr addrspace(1) %out, [8 x i32], i64 %in) {
 ; GCN-LABEL: {{^}}trunc_i64_to_i32_store:
 ; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1],
 ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
@@ -16,7 +16,7 @@ define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, [8 x i
 ; EG: LSHR
 ; EG-NEXT: 2(
 
-  %result = trunc i64 %in to i32 store i32 %result, i32 addrspace(1)* %out, align 4
+  %result = trunc i64 %in to i32 store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -28,10 +28,10 @@ define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, [8 x i
 ; SI: buffer_store_dword [[VSHL]]
 ; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]]
 
-define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, [8 x i32], i64 %a) {
+define amdgpu_kernel void @trunc_load_shl_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
   %b = shl i64 %a, 2
   %result = trunc i64 %b to i32
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -45,50 +45,50 @@ define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, [8 x i32],
 ; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]]
 ; GCN: v_mov_b32_e32
 ; GCN: v_mov_b32_e32
-define amdgpu_kernel void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @trunc_shl_i64(ptr addrspace(1) %out2, ptr addrspace(1) %out, i64 %a) {
   %aa = add i64 %a, 234 ; Prevent shrinking store.
   %b = shl i64 %aa, 2
   %result = trunc i64 %b to i32
-  store i32 %result, i32 addrspace(1)* %out, align 4
-  store i64 %b, i64 addrspace(1)* %out2, align 8 ; Prevent reducing ops to 32-bits
+  store i32 %result, ptr addrspace(1) %out, align 4
+  store i64 %b, ptr addrspace(1) %out2, align 8 ; Prevent reducing ops to 32-bits
   ret void
 }
 
 ; GCN-LABEL: {{^}}trunc_i32_to_i1:
 ; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
-define amdgpu_kernel void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
-  %a = load i32, i32 addrspace(1)* %ptr, align 4
+define amdgpu_kernel void @trunc_i32_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %ptr) {
+  %a = load i32, ptr addrspace(1) %ptr, align 4
   %trunc = trunc i32 %a to i1
   %result = select i1 %trunc, i32 1, i32 0
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}trunc_i8_to_i1:
 ; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
-define amdgpu_kernel void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) {
-  %a = load i8, i8 addrspace(1)* %ptr, align 4
+define amdgpu_kernel void @trunc_i8_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %ptr) {
+  %a = load i8, ptr addrspace(1) %ptr, align 4
   %trunc = trunc i8 %a to i1
   %result = select i1 %trunc, i8 1, i8 0
-  store i8 %result, i8 addrspace(1)* %out, align 4
+  store i8 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}sgpr_trunc_i16_to_i1:
 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
-define amdgpu_kernel void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) {
+define amdgpu_kernel void @sgpr_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %a) {
   %trunc = trunc i16 %a to i1
   %result = select i1 %trunc, i16 1, i16 0
-  store i16 %result, i16 addrspace(1)* %out, align 4
+  store i16 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}sgpr_trunc_i32_to_i1:
 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
-define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @sgpr_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %a) {
   %trunc = trunc i32 %a to i1
   %result = select i1 %trunc, i32 1, i32 0
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -97,10 +97,10 @@ define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a)
 ; VI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: s_bitcmp1_b32 s[[SLO]], 0
 ; GCN: s_cselect_b32 {{s[0-9]+}}, 63, -12
-define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, [8 x i32], i64 %x) {
+define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, [8 x i32], i64 %x) {
   %trunc = trunc i64 %x to i1
   %sel = select i1 %trunc, i32 63, i32 -12
-  store i32 %sel, i32 addrspace(1)* %out
+  store i32 %sel, ptr addrspace(1) %out
   ret void
 }
 
@@ -110,14 +110,14 @@ define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, [8 x i32],
 ; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]]
 ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]]
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
-define amdgpu_kernel void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_trunc_i64_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %x = load i64, i64 addrspace(1)* %gep
+  %gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %x = load i64, ptr addrspace(1) %gep
 
   %trunc = trunc i64 %x to i1
   %sel = select i1 %trunc, i32 63, i32 -12
-  store i32 %sel, i32 addrspace(1)* %out.gep
+  store i32 %sel, ptr addrspace(1) %out.gep
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 9ddf1498c0198..d9b0106ccc4e5 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -31,7 +31,7 @@ bb:
   unreachable
 }
 
-define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, i32 %tmp5.i.i, i32 %tmp427.i, i1 %tmp438.i, double %tmp27.i, i1 %tmp48.i) {
+define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i32 %tmp5.i.i, i32 %tmp427.i, i1 %tmp438.i, double %tmp27.i, i1 %tmp48.i) {
 ; GLOBALNESS1-LABEL: kernel:
 ; GLOBALNESS1:       ; %bb.0: ; %bb
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[54:55], s[6:7]
@@ -804,15 +804,15 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS0-NEXT:  .LBB1_34: ; %UnifiedUnreachableBlock
 bb:
-  store i32 0, i32 addrspace(1)* null, align 4
-  %tmp4 = load i32, i32 addrspace(1)* %arg1.global, align 4
+  store i32 0, ptr addrspace(1) null, align 4
+  %tmp4 = load i32, ptr addrspace(1) %arg1.global, align 4
   br label %bb5
 
 bb5:                                              ; preds = %bb5.backedge, %bb
   %tmp4.i.sroa.0.0 = phi <9 x double> [ undef, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ]
-  %tmp14.1.i = load i32, i32* inttoptr (i64 128 to i32*), align 128
-  store i32 0, i32 addrspace(5)* null, align 4
-  %tmp14.2.i = load i32, i32* inttoptr (i64 128 to i32*), align 128
+  %tmp14.1.i = load i32, ptr inttoptr (i64 128 to ptr), align 128
+  store i32 0, ptr addrspace(5) null, align 4
+  %tmp14.2.i = load i32, ptr inttoptr (i64 128 to ptr), align 128
   %tmp15.2.i = icmp eq i32 %tmp14.2.i, 0
   %spec.select.2.i = select i1 %tmp15.2.i, i32 0, i32 %tmp14.1.i
   tail call void @wobble()
@@ -833,8 +833,8 @@ bb11.i.i:                                         ; preds = %bb4.i.i
   unreachable
 
 baz.exit.i:                                       ; preds = %bb4.i.i, %bb5
-  %tmp26.i = load i32, i32* null, align 4
-  %tmp27.i4 = load double, double addrspace(1)* null, align 8
+  %tmp26.i = load i32, ptr null, align 4
+  %tmp27.i4 = load double, ptr addrspace(1) null, align 8
   %tmp31.i = icmp slt i32 %tmp26.i, 0
   br i1 %tmp31.i, label %bb33.i, label %bb64.i
 
@@ -843,7 +843,7 @@ bb33.i:                                           ; preds = %baz.exit.i
   br i1 %tmp38.i, label %bb39.i, label %bb44.lr.ph.i
 
 bb39.i:                                           ; preds = %bb33.i
-  store double 0.000000e+00, double addrspace(1)* null, align 8
+  store double 0.000000e+00, ptr addrspace(1) null, align 8
   br label %bb44.lr.ph.i
 
 bb44.lr.ph.i:                                     ; preds = %bb39.i, %bb33.i
@@ -883,13 +883,13 @@ spam.exit.i:                                      ; preds = %bb8.i.i, %bb6.i.i,
 bb55.i:                                           ; preds = %spam.exit.i
   tail call void @wobble()
   %tmp0 = extractelement <9 x double> %tmp4.i.sroa.0.0, i32 0
-  store double %tmp0, double addrspace(1)* null, align 8
+  store double %tmp0, ptr addrspace(1) null, align 8
   tail call void @wobble()
   %tmp61.i = icmp eq i32 %spec.select.2.i, 0
   br i1 %tmp61.i, label %bb62.i, label %bb63.i
 
 bb62.i:                                           ; preds = %bb55.i
-  store double 0.000000e+00, double addrspace(1)* null, align 8
+  store double 0.000000e+00, ptr addrspace(1) null, align 8
   br label %bb63.i
 
 bb63.i:                                           ; preds = %bb62.i, %bb55.i, %spam.exit.i, %bb46.i, %bb44.i
@@ -907,7 +907,7 @@ bb67.i:                                           ; preds = %bb64.i
   br i1 %tmp68.i, label %bb69.i, label %bb70.i
 
 bb69.i:                                           ; preds = %bb67.i
-  store double 0.000000e+00, double addrspace(1)* null, align 8
+  store double 0.000000e+00, ptr addrspace(1) null, align 8
   br label %bb70.i
 
 bb70.i:                                           ; preds = %bb69.i, %bb67.i
@@ -915,6 +915,6 @@ bb70.i:                                           ; preds = %bb69.i, %bb67.i
   br i1 %tmp3.i.i2, label %bb73.i, label %bb5.backedge
 
 bb73.i:                                           ; preds = %bb70.i
-  store double 0.000000e+00, double addrspace(1)* null, align 8
+  store double 0.000000e+00, ptr addrspace(1) null, align 8
   br label %bb5.backedge
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 3cbb8fad54136..b38f77e07fae6 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -5,7 +5,7 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GFX1030
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
 
-define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: udiv_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -174,15 +174,15 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; EG-NEXT:     CNDE_INT T0.X, PS, T1.W, PV.W,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1)* %in
-  %b = load i32, i32 addrspace(1)* %b_ptr
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
   %result = udiv i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; SI-LABEL: s_udiv_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -335,7 +335,7 @@ define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %result = udiv i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -343,7 +343,7 @@ define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 ; The code generated by udiv is long and complex and may frequently
 ; change. The goal of this test is to make sure the ISel doesn't fail
 ; when it gets a v4i32 udiv
-define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: udiv_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -608,15 +608,15 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; EG-NEXT:     CNDE_INT T1.X, PS, T1.Z, PV.W,
 ; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <2 x i32>, ptr addrspace(1) %in
+  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
   %result = udiv <2 x i32> %a, %b
-  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: udiv_v4i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1087,15 +1087,15 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; EG-NEXT:     CNDE_INT T4.X, PS, T2.Y, PV.W,
 ; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <4 x i32>, ptr addrspace(1) %in
+  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
   %result = udiv <4 x i32> %a, %b
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: udiv_i32_div_pow2:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1173,14 +1173,14 @@ define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspa
 ; EG-NEXT:     LSHR T0.X, T0.X, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    4(5.605194e-45), 2(2.802597e-45)
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1)* %in
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
   %result = udiv i32 %a, 16
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: udiv_i32_div_k_even:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1267,14 +1267,14 @@ define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrs
 ; EG-NEXT:     LSHR T0.X, PS, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    25(3.503246e-44), 2(2.802597e-45)
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1)* %in
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
   %result = udiv i32 %a, 34259182
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: udiv_i32_div_k_odd:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1361,14 +1361,14 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrsp
 ; EG-NEXT:     LSHR T0.X, PS, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    24(3.363116e-44), 2(2.802597e-45)
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1)* %in
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
   %result = udiv i32 %a, 34259183
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: v_udiv_i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1499,16 +1499,16 @@ define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %i
 ; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    255(3.573311e-43), 2(2.802597e-45)
-  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %num = load i8, i8 addrspace(1) * %in
-  %den = load i8, i8 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %num = load i8, ptr addrspace(1) %in
+  %den = load i8, ptr addrspace(1) %den_ptr
   %result = udiv i8 %num, %den
   %result.ext = zext i8 %result to i32
-  store i32 %result.ext, i32 addrspace(1)* %out
+  store i32 %result.ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: v_udiv_i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1648,16 +1648,16 @@ define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)*
 ; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
-  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
-  %num = load i16, i16 addrspace(1) * %in
-  %den = load i16, i16 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
+  %num = load i16, ptr addrspace(1) %in
+  %den = load i16, ptr addrspace(1) %den_ptr
   %result = udiv i16 %num, %den
   %result.ext = zext i16 %result to i32
-  store i32 %result.ext, i32 addrspace(1)* %out
+  store i32 %result.ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: v_udiv_i23:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1845,16 +1845,16 @@ define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)*
 ; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    8388607(1.175494e-38), 2(2.802597e-45)
-  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
-  %num = load i23, i23 addrspace(1) * %in
-  %den = load i23, i23 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i23, ptr addrspace(1) %in, i23 1
+  %num = load i23, ptr addrspace(1) %in
+  %den = load i23, ptr addrspace(1) %den_ptr
   %result = udiv i23 %num, %den
   %result.ext = zext i23 %result to i32
-  store i32 %result.ext, i32 addrspace(1)* %out
+  store i32 %result.ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; SI-LABEL: v_udiv_i24:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2091,16 +2091,16 @@ define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)*
 ; EG-NEXT:     CNDE_INT T0.X, PS, T2.W, PV.W,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
-  %num = load i24, i24 addrspace(1) * %in
-  %den = load i24, i24 addrspace(1) * %den_ptr
+  %den_ptr = getelementptr i24, ptr addrspace(1) %in, i24 1
+  %num = load i24, ptr addrspace(1) %in
+  %den = load i24, ptr addrspace(1) %den_ptr
   %result = udiv i24 %num, %den
   %result.ext = zext i24 %result to i32
-  store i32 %result.ext, i32 addrspace(1)* %out
+  store i32 %result.ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
+define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) {
 ; SI-LABEL: scalarize_mulhu_4xi32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2239,9 +2239,9 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocaptu
 ; EG-NEXT:     LSHR T0.X, PS, literal.x,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Z, literal.y,
 ; EG-NEXT:    10(1.401298e-44), 2(2.802597e-45)
-  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
+  %1 = load <4 x i32>, ptr addrspace(1) %in, align 16
   %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
-  store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
+  store <4 x i32> %2, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -2301,7 +2301,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) {
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, 1,
 ; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
   %i = udiv i32 %p, 2
-  store volatile i32 %i, i32 addrspace(1)* undef
+  store volatile i32 %i, ptr addrspace(1) undef
   ret void
 }
 
@@ -2367,11 +2367,11 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
 ; EG-NEXT:     MOV * T1.X, literal.x,
 ; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
    %i = udiv i32 %p, 3
-   store volatile i32 %i, i32 addrspace(1)* undef
+   store volatile i32 %i, ptr addrspace(1) undef
    ret void
 }
 
-define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
+define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readonly %arg) {
 ; SI-LABEL: fdiv_test_denormals:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_mov_b32 s0, 0
@@ -2514,14 +2514,14 @@ define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readon
 ; EG-NEXT:     MOV * T1.X, literal.x,
 ; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
 bb:
-  %tmp = load i8, i8 addrspace(1)* null, align 1
+  %tmp = load i8, ptr addrspace(1) null, align 1
   %tmp1 = sext i8 %tmp to i32
-  %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef
-  %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
+  %tmp2 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 undef
+  %tmp3 = load i8, ptr addrspace(1) %tmp2, align 1
   %tmp4 = sext i8 %tmp3 to i32
   %tmp5 = sdiv i32 %tmp1, %tmp4
   %tmp6 = trunc i32 %tmp5 to i8
-  store i8 %tmp6, i8 addrspace(1)* null, align 1
+  store i8 %tmp6, ptr addrspace(1) null, align 1
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 3c3c77b1a5331..2e3c38df149ae 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
 
-define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_udiv_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
@@ -199,7 +199,7 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = udiv i64 %x, %y
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -401,7 +401,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_udiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_udiv24_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xe
@@ -454,7 +454,7 @@ define amdgpu_kernel void @s_test_udiv24_64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = lshr i64 %x, 40
   %2 = lshr i64 %y, 40
   %result = udiv i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -500,7 +500,7 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) {
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_udiv32_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_udiv32_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xe
@@ -547,11 +547,11 @@ define amdgpu_kernel void @s_test_udiv32_i64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = lshr i64 %x, 32
   %2 = lshr i64 %y, 32
   %result = udiv i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_udiv31_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_udiv31_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xe
@@ -604,11 +604,11 @@ define amdgpu_kernel void @s_test_udiv31_i64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = lshr i64 %x, 33
   %2 = lshr i64 %y, 33
   %result = udiv i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_udiv23_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_udiv23_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xe
@@ -661,11 +661,11 @@ define amdgpu_kernel void @s_test_udiv23_i64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = lshr i64 %x, 41
   %2 = lshr i64 %y, 41
   %result = udiv i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 %y) {
+define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 %y) {
 ; GCN-LABEL: s_test_udiv24_i48:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
@@ -871,11 +871,11 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
   %1 = lshr i48 %x, 24
   %2 = lshr i48 %y, 24
   %result = udiv i48 %1, %2
-  store i48 %result, i48 addrspace(1)* %out
+  store i48 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_udiv_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1050,7 +1050,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = udiv i64 24, %x
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -1323,7 +1323,7 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_udiv_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
@@ -1494,7 +1494,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = udiv i64 %x, 24
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -1673,7 +1673,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_udiv24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_udiv24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1717,11 +1717,11 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_endpgm
   %x.shr = lshr i64 %x, 40
   %result = udiv i64 24, %x.shr
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_udiv24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_udiv24_k_den_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_udiv24_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1767,7 +1767,7 @@ define amdgpu_kernel void @s_test_udiv24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_endpgm
   %x.shr = lshr i64 %x, 40
   %result = udiv i64 %x.shr, 23423
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index e1a79bf1e9c7a..21ae4feb25322 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
 
-define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1, [8 x i32], i32 %x, [8 x i32], i32 %y) {
+define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr addrspace(1) %out1, [8 x i32], i32 %x, [8 x i32], i32 %y) {
 ; R600-LABEL: test_udivrem:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    ALU 21, @4, KC0[CB0:0-32], KC1[]
@@ -116,13 +116,13 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32
 ; GFX8-NEXT:    flat_store_dword v[2:3], v0
 ; GFX8-NEXT:    s_endpgm
   %result0 = udiv i32 %x, %y
-  store i32 %result0, i32 addrspace(1)* %out0
+  store i32 %result0, ptr addrspace(1) %out0
   %result1 = urem i32 %x, %y
-  store i32 %result1, i32 addrspace(1)* %out1
+  store i32 %result1, ptr addrspace(1) %out1
   ret void
 }
 
-define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
 ; R600-LABEL: test_udivrem_v2:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    ALU 29, @4, KC0[CB0:0-32], KC1[]
@@ -257,13 +257,13 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
   %result0 = udiv <2 x i32> %x, %y
-  store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result0, ptr addrspace(1) %out
   %result1 = urem <2 x i32> %x, %y
-  store <2 x i32> %result1, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result1, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
 ; R600-LABEL: test_udivrem_v4:
 ; R600:       ; %bb.0:
 ; R600-NEXT:    ALU 57, @4, KC0[CB0:0-32], KC1[]
@@ -502,8 +502,8 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
   %result0 = udiv <4 x i32> %x, %y
-  store <4 x i32> %result0, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result0, ptr addrspace(1) %out
   %result1 = urem <4 x i32> %x, %y
-  store <4 x i32> %result1, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result1, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll
index 6b629ca9582f0..99effa8479bae 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll
@@ -12,12 +12,12 @@
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
-  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %num = load i8, i8 addrspace(1) * %in
-  %den = load i8, i8 addrspace(1) * %den_ptr
+define amdgpu_kernel void @udiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %num = load i8, ptr addrspace(1) %in
+  %den = load i8, ptr addrspace(1) %den_ptr
   %result = udiv i8 %num, %den
-  store i8 %result, i8 addrspace(1)* %out
+  store i8 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -31,12 +31,12 @@ define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
-  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %num = load i8, i8 addrspace(1) * %in
-  %den = load i8, i8 addrspace(1) * %den_ptr
+define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %num = load i8, ptr addrspace(1) %in
+  %den = load i8, ptr addrspace(1) %den_ptr
   %result = udiv i8 %num, %den
-  store i8 %result, i8 addrspace(1)* %out
+  store i8 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -50,12 +50,12 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(i8 addrspace(1)* %out,
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define amdgpu_kernel void @udiv24_i8_denorm_flush_in(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
-  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %num = load i8, i8 addrspace(1) * %in
-  %den = load i8, i8 addrspace(1) * %den_ptr
+define amdgpu_kernel void @udiv24_i8_denorm_flush_in(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %num = load i8, ptr addrspace(1) %in
+  %den = load i8, ptr addrspace(1) %den_ptr
   %result = udiv i8 %num, %den
-  store i8 %result, i8 addrspace(1)* %out
+  store i8 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -69,12 +69,12 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in(i8 addrspace(1)* %out, i8 a
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define amdgpu_kernel void @udiv24_i8_denorm_flush_out(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #2 {
-  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %num = load i8, i8 addrspace(1) * %in
-  %den = load i8, i8 addrspace(1) * %den_ptr
+define amdgpu_kernel void @udiv24_i8_denorm_flush_out(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 {
+  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %num = load i8, ptr addrspace(1) %in
+  %den = load i8, ptr addrspace(1) %den_ptr
   %result = udiv i8 %num, %den
-  store i8 %result, i8 addrspace(1)* %out
+  store i8 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -88,12 +88,12 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_out(i8 addrspace(1)* %out, i8
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
-  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
-  %num = load i16, i16 addrspace(1) * %in, align 2
-  %den = load i16, i16 addrspace(1) * %den_ptr, align 2
+define amdgpu_kernel void @udiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
+  %num = load i16, ptr addrspace(1) %in, align 2
+  %den = load i16, ptr addrspace(1) %den_ptr, align 2
   %result = udiv i16 %num, %den
-  store i16 %result, i16 addrspace(1)* %out, align 2
+  store i16 %result, ptr addrspace(1) %out, align 2
   ret void
 }
 
@@ -107,16 +107,16 @@ define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)*
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define amdgpu_kernel void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @udiv23_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i23.0 = shl i32 %num, 9
   %den.i23.0 = shl i32 %den, 9
   %num.i23 = lshr i32 %num.i23.0, 9
   %den.i23 = lshr i32 %den.i23.0, 9
   %result = udiv i32 %num.i23, %den.i23
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -124,16 +124,16 @@ define amdgpu_kernel void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; SI: v_rcp_iflag
 ; SI-NOT: v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @udiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = lshr i32 %num.i24.0, 8
   %den.i24 = lshr i32 %den.i24.0, 8
   %result = udiv i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -141,16 +141,16 @@ define amdgpu_kernel void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; SI: v_rcp_iflag
 ; SI-NOT: v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @no_udiv24_u23_u24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i23.0 = shl i32 %num, 9
   %den.i24.0 = shl i32 %den, 8
   %num.i23 = lshr i32 %num.i23.0, 9
   %den.i24 = lshr i32 %den.i24.0, 8
   %result = udiv i32 %num.i23, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -158,16 +158,16 @@ define amdgpu_kernel void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 add
 ; SI: v_rcp_iflag
 ; SI-NOT: v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @no_udiv24_u24_u23_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i23.0 = shl i32 %den, 9
   %num.i24 = lshr i32 %num.i24.0, 8
   %den.i23 = lshr i32 %den.i23.0, 9
   %result = udiv i32 %num.i24, %den.i23
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -178,16 +178,16 @@ define amdgpu_kernel void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 add
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i25.0 = shl i32 %num, 7
   %den.i25.0 = shl i32 %den, 7
   %num.i25 = lshr i32 %num.i25.0, 7
   %den.i25 = lshr i32 %den.i25.0, 7
   %result = udiv i32 %num.i25, %den.i25
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -198,16 +198,16 @@ define amdgpu_kernel void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = lshr i32 %num.i24.0, 8
   %den.i24 = lshr i32 %den.i24.0, 7
   %result = udiv i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -218,16 +218,16 @@ define amdgpu_kernel void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addr
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @test_no_udiv24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = lshr i32 %num.i24.0, 7
   %den.i24 = lshr i32 %den.i24.0, 8
   %result = udiv i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -241,12 +241,12 @@ define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addr
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
-  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %num = load i8, i8 addrspace(1) * %in
-  %den = load i8, i8 addrspace(1) * %den_ptr
+define amdgpu_kernel void @urem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %num = load i8, ptr addrspace(1) %in
+  %den = load i8, ptr addrspace(1) %den_ptr
   %result = urem i8 %num, %den
-  store i8 %result, i8 addrspace(1)* %out
+  store i8 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -260,28 +260,28 @@ define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define amdgpu_kernel void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
-  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
-  %num = load i16, i16 addrspace(1) * %in, align 2
-  %den = load i16, i16 addrspace(1) * %den_ptr, align 2
+define amdgpu_kernel void @urem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
+  %num = load i16, ptr addrspace(1) %in, align 2
+  %den = load i16, ptr addrspace(1) %den_ptr, align 2
   %result = urem i16 %num, %den
-  store i16 %result, i16 addrspace(1)* %out, align 2
+  store i16 %result, ptr addrspace(1) %out, align 2
   ret void
 }
 
 ; FUNC-LABEL: {{^}}urem24_i32:
 ; SI-NOT: v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @urem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = lshr i32 %num.i24.0, 8
   %den.i24 = lshr i32 %den.i24.0, 8
   %result = urem i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -292,16 +292,16 @@ define amdgpu_kernel void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = lshr i32 %num.i24.0, 7
   %den.i24 = lshr i32 %den.i24.0, 7
   %result = urem i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -312,16 +312,16 @@ define amdgpu_kernel void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 8
   %den.i24.0 = shl i32 %den, 7
   %num.i24 = lshr i32 %num.i24.0, 8
   %den.i24 = lshr i32 %den.i24.0, 7
   %result = urem i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -332,16 +332,16 @@ define amdgpu_kernel void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addr
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @test_no_urem24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i24.0 = shl i32 %num, 7
   %den.i24.0 = shl i32 %den, 8
   %num.i24 = lshr i32 %num.i24.0, 7
   %den.i24 = lshr i32 %den.i24.0, 8
   %result = urem i32 %num.i24, %den.i24
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -350,16 +350,16 @@ define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addr
 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff,
 
 ; EG: RECIP_IEEE
-define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @test_udiv24_u16_u23_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i16.0 = shl i32 %num, 16
   %den.i23.0 = shl i32 %den, 9
   %num.i16 = lshr i32 %num.i16.0, 16
   %den.i23 = lshr i32 %den.i23.0, 9
   %result = udiv i32 %num.i16, %den.i23
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -368,16 +368,16 @@ define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 a
 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff,
 
 ; EG: RECIP_IEEE
-define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %num = load i32, i32 addrspace(1) * %in, align 4
-  %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+define amdgpu_kernel void @test_udiv24_u23_u16_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %num = load i32, ptr addrspace(1) %in, align 4
+  %den = load i32, ptr addrspace(1) %den_ptr, align 4
   %num.i23.0 = shl i32 %num, 9
   %den.i16.0 = shl i32 %den, 16
   %num.i23 = lshr i32 %num.i23.0, 9
   %den.i16 = lshr i32 %den.i16.0, 16
   %result = udiv i32 %num.i23, %den.i16
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll
index 07b97f6139d47..13b47c0d7f8f9 100644
--- a/llvm/test/CodeGen/AMDGPU/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/umed3.ll
@@ -6,11 +6,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32:
 ; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define amdgpu_kernel void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep0
 
   %icmp0 = icmp ugt i32 %a, 12
   %i0 = select i1 %icmp0, i32 %a, i32 12
@@ -18,18 +18,18 @@ define amdgpu_kernel void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 ad
   %icmp1 = icmp ult i32 %i0, 17
   %i1 = select i1 %icmp1, i32 %i0, i32 17
 
-  store i32 %i1, i32 addrspace(1)* %outgep
+  store i32 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umed3_multi_use_r_i_i_i32:
 ; GCN: v_max_u32
 ; GCN: v_min_u32
-define amdgpu_kernel void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_multi_use_r_i_i_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep0
 
   %icmp0 = icmp ugt i32 %a, 12
   %i0 = select i1 %icmp0, i32 %a, i32 12
@@ -37,19 +37,19 @@ define amdgpu_kernel void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %o
   %icmp1 = icmp ult i32 %i0, 17
   %i1 = select i1 %icmp1, i32 %i0, i32 17
 
-  store volatile i32 %i0, i32 addrspace(1)* %outgep
-  store volatile i32 %i1, i32 addrspace(1)* %outgep
+  store volatile i32 %i0, ptr addrspace(1) %outgep
+  store volatile i32 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32:
 ; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
 ; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
-define amdgpu_kernel void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_sign_mismatch_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %a = load i32, i32 addrspace(1)* %gep0
+  %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  %a = load i32, ptr addrspace(1) %gep0
 
   %icmp0 = icmp sgt i32 %a, 12
   %i0 = select i1 %icmp0, i32 %a, i32 12
@@ -57,18 +57,18 @@ define amdgpu_kernel void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)
   %icmp1 = icmp ult i32 %i0, 17
   %i1 = select i1 %icmp1, i32 %i0, i32 17
 
-  store i32 %i1, i32 addrspace(1)* %outgep
+  store i32 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i64:
 ; GCN: v_cmp_lt_u64
 ; GCN: v_cmp_gt_u64
-define amdgpu_kernel void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
-  %a = load i64, i64 addrspace(1)* %gep0
+  %gep0 = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
+  %a = load i64, ptr addrspace(1) %gep0
 
   %icmp0 = icmp ugt i64 %a, 12
   %i0 = select i1 %icmp0, i64 %a, i64 12
@@ -76,18 +76,18 @@ define amdgpu_kernel void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 ad
   %icmp1 = icmp ult i64 %i0, 17
   %i1 = select i1 %icmp1, i64 %i0, i64 17
 
-  store i64 %i1, i64 addrspace(1)* %outgep
+  store i64 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16:
 ; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define amdgpu_kernel void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
-  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
-  %a = load i16, i16 addrspace(1)* %gep0
+  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
+  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
+  %a = load i16, ptr addrspace(1) %gep0
 
   %icmp0 = icmp ugt i16 %a, 12
   %i0 = select i1 %icmp0, i16 %a, i16 12
@@ -95,7 +95,7 @@ define amdgpu_kernel void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 ad
   %icmp1 = icmp ult i16 %i0, 17
   %i1 = select i1 %icmp1, i16 %i0, i16 17
 
-  store i16 %i1, i16 addrspace(1)* %outgep
+  store i16 %i1, ptr addrspace(1) %outgep
   ret void
 }
 
@@ -154,193 +154,193 @@ define internal i8 @umax8(i8 %x, i8 %y) #2 {
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_1:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_1(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_2:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_2(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_3:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_3(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_4:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_4(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_5:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_5(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_6:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_6(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_7:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_7(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_8:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_8(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_9:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_9(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_10:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_10(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_11:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_11(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_12:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_12(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_13:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_13(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_14:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_14(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_15:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_15(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
   %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -360,193 +360,193 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_16:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_16(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_16(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_17:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_17(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_17(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_18:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_18(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_18(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_19:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_19(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_19(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_20:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_20(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_20(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_21:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_21(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_21(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_22:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_22(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_22(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_23:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_23(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_23(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_24:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_24(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_24(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_25:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_25(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_25(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @umin(i32 %tmp1, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_26:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_26(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_26(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_27:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_27(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_27(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_28:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_28(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_28(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_29:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_29(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_29(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umax(i32 %tmp0, i32 %z)
   %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_30:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_30(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_30(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_31:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_31(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_31(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
   %tmp2 = call i32 @umax(i32 %z, i32 %tmp0)
   %tmp3 = call i32 @umin(i32 %tmp2, i32 %tmp1)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -555,13 +555,13 @@ bb:
 ; GCN: s_and_b32
 ; GCN: s_and_b32
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, [8 x i32], i16 %x, [8 x i32], i16 %y, [8 x i32], i16 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i16_pat_0(ptr addrspace(1) %arg, [8 x i32], i16 %x, [8 x i32], i16 %y, [8 x i32], i16 %z) #1 {
 bb:
   %tmp0 = call i16 @umin16(i16 %x, i16 %y)
   %tmp1 = call i16 @umax16(i16 %x, i16 %y)
   %tmp2 = call i16 @umin16(i16 %tmp1, i16 %z)
   %tmp3 = call i16 @umax16(i16 %tmp0, i16 %tmp2)
-  store i16 %tmp3, i16 addrspace(1)* %arg
+  store i16 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -570,13 +570,13 @@ bb:
 ; GCN: s_and_b32
 ; GCN: s_and_b32
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, [8 x i32], i8 %x, [8 x i32], i8 %y, [8 x i32], i8 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i8_pat_0(ptr addrspace(1) %arg, [8 x i32], i8 %x, [8 x i32], i8 %y, [8 x i32], i8 %z) #1 {
 bb:
   %tmp0 = call i8 @umin8(i8 %x, i8 %y)
   %tmp1 = call i8 @umax8(i8 %x, i8 %y)
   %tmp2 = call i8 @umin8(i8 %tmp1, i8 %z)
   %tmp3 = call i8 @umax8(i8 %tmp0, i8 %tmp2)
-  store i8 %tmp3, i8 addrspace(1)* %arg
+  store i8 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -584,14 +584,14 @@ bb:
 ; GCN: s_min_u32
 ; GCN-NOT: {{s_min_u32|s_max_u32}}
 ; GCN: v_med3_u32
-define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_0(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store volatile i32 %tmp0, i32 addrspace(1)* %arg
-  store volatile i32 %tmp3, i32 addrspace(1)* %arg
+  store volatile i32 %tmp0, ptr addrspace(1) %arg
+  store volatile i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -599,14 +599,14 @@ bb:
 ; GCN: s_max_u32
 ; GCN-NOT: {{s_min_u32|s_max_u32}}
 ; GCN: v_med3_u32
-define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_1(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store volatile i32 %tmp1, i32 addrspace(1)* %arg
-  store volatile i32 %tmp3, i32 addrspace(1)* %arg
+  store volatile i32 %tmp1, ptr addrspace(1) %arg
+  store volatile i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -615,28 +615,28 @@ bb:
 ; GCN: s_min_u32
 ; GCN-NOT: {{s_min_u32|s_max_u32}}
 ; GCN: v_med3_u32
-define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_2(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store volatile i32 %tmp2, i32 addrspace(1)* %arg
-  store volatile i32 %tmp3, i32 addrspace(1)* %arg
+  store volatile i32 %tmp2, ptr addrspace(1) %arg
+  store volatile i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_result:
 ; GCN-NOT: {{s_min_u32|s_max_u32}}
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_result(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store volatile i32 %tmp3, i32 addrspace(1)* %arg
-  store volatile i32 %tmp3, i32 addrspace(1)* %arg
+  store volatile i32 %tmp3, ptr addrspace(1) %arg
+  store volatile i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -644,7 +644,7 @@ bb:
 ; GCN-NOT: {{s_min_u32|s_max_u32}}
 ; GCN: v_med3_u32 v{{[0-9]+}}, [[B0:s[0-9]+]], [[B1:v[0-9]+]], v{{[0-9]+}}
 ; GCN: v_med3_u32 v{{[0-9]+}}, [[B0]], [[B1]], v{{[0-9]+}}
-define amdgpu_kernel void @s_test_smed3_reuse_bounds(i32 addrspace(1)* %arg, i32 %b0, i32 %b1, i32 %x, i32 %y) #1 {
+define amdgpu_kernel void @s_test_smed3_reuse_bounds(ptr addrspace(1) %arg, i32 %b0, i32 %b1, i32 %x, i32 %y) #1 {
 bb:
   %lo = call i32 @umin(i32 %b0, i32 %b1)
   %hi = call i32 @umax(i32 %b0, i32 %b1)
@@ -655,44 +655,44 @@ bb:
   %tmp1 = call i32 @umin(i32 %y, i32 %hi)
   %z1 = call i32 @umax(i32 %tmp1, i32 %lo)
 
-  store volatile i32 %z0, i32 addrspace(1)* %arg
-  store volatile i32 %z1, i32 addrspace(1)* %arg
+  store volatile i32 %z0, ptr addrspace(1) %arg
+  store volatile i32 %z1, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src0:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src0(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 1, i32 %y)
   %tmp1 = call i32 @umax(i32 1, i32 %y)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src1:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
-define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src1(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 2)
   %tmp1 = call i32 @umax(i32 %x, i32 2)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src2:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 9
-define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src2(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
   %tmp2 = call i32 @umin(i32 %tmp1, i32 9)
   %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
-  store i32 %tmp3, i32 addrspace(1)* %arg
+  store i32 %tmp3, ptr addrspace(1) %arg
   ret void
 }
 
@@ -706,43 +706,43 @@ bb:
 ; VI: v_max_u16
 
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+define amdgpu_kernel void @v_test_umed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
-  %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
-  %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %x = load i16, i16 addrspace(1)* %gep0
-  %y = load i16, i16 addrspace(1)* %gep1
-  %z = load i16, i16 addrspace(1)* %gep2
+  %gep0 = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i32 %tid
+  %gep1 = getelementptr inbounds i16, ptr addrspace(1) %gep0, i32 3
+  %gep2 = getelementptr inbounds i16, ptr addrspace(1) %gep0, i32 8
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %x = load i16, ptr addrspace(1) %gep0
+  %y = load i16, ptr addrspace(1) %gep1
+  %z = load i16, ptr addrspace(1) %gep2
 
   %tmp0 = call i16 @umin16(i16 %x, i16 %y)
   %tmp1 = call i16 @umax16(i16 %x, i16 %y)
   %tmp2 = call i16 @umin16(i16 %tmp1, i16 %z)
   %tmp3 = call i16 @umax16(i16 %tmp0, i16 %tmp2)
-  store i16 %tmp3, i16 addrspace(1)* %out.gep
+  store i16 %tmp3, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_umed3_i16_pat_1:
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_test_umed3_i16_pat_1(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+define amdgpu_kernel void @v_test_umed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
-  %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
-  %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
-  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %x = load i16, i16 addrspace(1)* %gep0
-  %y = load i16, i16 addrspace(1)* %gep1
-  %z = load i16, i16 addrspace(1)* %gep2
+  %gep0 = getelementptr inbounds i16, ptr addrspace(1) %a.ptr, i32 %tid
+  %gep1 = getelementptr inbounds i16, ptr addrspace(1) %gep0, i32 3
+  %gep2 = getelementptr inbounds i16, ptr addrspace(1) %gep0, i32 8
+  %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
+  %x = load i16, ptr addrspace(1) %gep0
+  %y = load i16, ptr addrspace(1) %gep1
+  %z = load i16, ptr addrspace(1) %gep2
 
   %tmp0 = call i16 @umin16(i16 %x, i16 %y)
   %tmp1 = call i16 @umax16(i16 %x, i16 %y)
   %tmp2 = call i16 @umax16(i16 %tmp0, i16 %z)
   %tmp3 = call i16 @umin16(i16 %tmp1, i16 %tmp2)
-  store i16 %tmp3, i16 addrspace(1)* %out.gep
+  store i16 %tmp3, ptr addrspace(1) %out.gep
   ret void
 }
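
The rewrite applied throughout these tests is mechanical: load, store and getelementptr already spell out the value type they operate on, so that type stays, while every pointer-typed argument and operand collapses to a bare ptr addrspace(N). A minimal illustrative sketch in LLVM IR (the function name is invented, not taken from this patch):

  ; typed pointers (before)
  define void @example(i32 addrspace(1)* %p) {
    %v = load i32, i32 addrspace(1)* %p
    store i32 %v, i32 addrspace(1)* %p
    ret void
  }

  ; opaque pointers (after)
  define void @example(ptr addrspace(1) %p) {
    %v = load i32, ptr addrspace(1) %p
    store i32 %v, ptr addrspace(1) %p
    ret void
  }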
 

diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
index b69140d408ee3..827e26d6e4615 100644
--- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -9,9 +9,9 @@
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
-  %v = load i16, i16 addrspace(3)* %p, align 1
-  store i16 %v, i16 addrspace(3)* %r, align 1
+define amdgpu_kernel void @local_unaligned_load_store_i16(ptr addrspace(3) %p, ptr addrspace(3) %r) #0 {
+  %v = load i16, ptr addrspace(3) %p, align 1
+  store i16 %v, ptr addrspace(3) %r, align 1
   ret void
 }
 
@@ -24,9 +24,9 @@ define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p,
 ; UNALIGNED: buffer_load_ushort
 ; UNALIGNED: buffer_store_short
 ; SI: s_endpgm
-define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
-  %v = load i16, i16 addrspace(1)* %p, align 1
-  store i16 %v, i16 addrspace(1)* %r, align 1
+define amdgpu_kernel void @global_unaligned_load_store_i16(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
+  %v = load i16, ptr addrspace(1) %p, align 1
+  store i16 %v, ptr addrspace(1) %r, align 1
   ret void
 }
 
@@ -43,9 +43,9 @@ define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p,
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
-  %v = load i32, i32 addrspace(3)* %p, align 1
-  store i32 %v, i32 addrspace(3)* %r, align 1
+define amdgpu_kernel void @local_unaligned_load_store_i32(ptr addrspace(3) %p, ptr addrspace(3) %r) #0 {
+  %v = load i32, ptr addrspace(3) %p, align 1
+  store i32 %v, ptr addrspace(3) %r, align 1
   ret void
 }
 
@@ -61,9 +61,9 @@ define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p,
 
 ; UNALIGNED: buffer_load_dword
 ; UNALIGNED: buffer_store_dword
-define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
-  %v = load i32, i32 addrspace(1)* %p, align 1
-  store i32 %v, i32 addrspace(1)* %r, align 1
+define amdgpu_kernel void @global_unaligned_load_store_i32(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
+  %v = load i32, ptr addrspace(1) %p, align 1
+  store i32 %v, ptr addrspace(1) %r, align 1
   ret void
 }
 
@@ -75,9 +75,9 @@ define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p,
 
 ; UNALIGNED: buffer_load_dword
 ; UNALIGNED: buffer_store_dword
-define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
-  %v = load i32, i32 addrspace(1)* %p, align 2
-  store i32 %v, i32 addrspace(1)* %r, align 2
+define amdgpu_kernel void @global_align2_load_store_i32(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
+  %v = load i32, ptr addrspace(1) %p, align 2
+  store i32 %v, ptr addrspace(1) %r, align 2
   ret void
 }
 
@@ -86,9 +86,9 @@ define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i3
 ; GCN: ds_read_u16
 ; GCN: ds_write_b16
 ; GCN: ds_write_b16
-define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
-  %v = load i32, i32 addrspace(3)* %p, align 2
-  store i32 %v, i32 addrspace(3)* %r, align 2
+define amdgpu_kernel void @local_align2_load_store_i32(ptr addrspace(3) %p, ptr addrspace(3) %r) #0 {
+  %v = load i32, ptr addrspace(3) %p, align 2
+  store i32 %v, ptr addrspace(3) %r, align 2
   ret void
 }
 
@@ -133,9 +133,9 @@ define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32
 ; SI-NOT: v_lshl
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
-  %v = load i64, i64 addrspace(3)* %p, align 1
-  store i64 %v, i64 addrspace(3)* %r, align 1
+define amdgpu_kernel void @local_unaligned_load_store_i64(ptr addrspace(3) %p, ptr addrspace(3) %r) #0 {
+  %v = load i64, ptr addrspace(3) %p, align 1
+  store i64 %v, ptr addrspace(3) %r, align 1
   ret void
 }
 
@@ -180,9 +180,9 @@ define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p,
 ; SI-NOT: v_lshl
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
-  %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
-  store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
+define amdgpu_kernel void @local_unaligned_load_store_v2i32(ptr addrspace(3) %p, ptr addrspace(3) %r) #0 {
+  %v = load <2 x i32>, ptr addrspace(3) %p, align 1
+  store <2 x i32> %v, ptr addrspace(3) %r, align 1
   ret void
 }
 
@@ -210,9 +210,9 @@ define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(
 
 ; UNALIGNED: buffer_load_dwordx2
 ; UNALIGNED: buffer_store_dwordx2
-define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
-  %v = load i64, i64 addrspace(1)* %p, align 2
-  store i64 %v, i64 addrspace(1)* %r, align 2
+define amdgpu_kernel void @global_align2_load_store_i64(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
+  %v = load i64, ptr addrspace(1) %p, align 2
+  store i64 %v, ptr addrspace(1) %r, align 2
   ret void
 }
 
@@ -240,9 +240,9 @@ define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i6
 
 ; UNALIGNED: buffer_load_dwordx2
 ; UNALIGNED: buffer_store_dwordx2
-define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
-  %v = load i64, i64 addrspace(1)* %p, align 1
-  store i64 %v, i64 addrspace(1)* %r, align 1
+define amdgpu_kernel void @unaligned_load_store_i64_global(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
+  %v = load i64, ptr addrspace(1) %p, align 1
+  store i64 %v, ptr addrspace(1) %r, align 1
   ret void
 }
 
@@ -287,9 +287,9 @@ define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p,
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
 ; GCN: s_endpgm
-define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
-  %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
-  store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
+define amdgpu_kernel void @local_unaligned_load_store_v4i32(ptr addrspace(3) %p, ptr addrspace(3) %r) #0 {
+  %v = load <4 x i32>, ptr addrspace(3) %p, align 1
+  store <4 x i32> %v, ptr addrspace(3) %r, align 1
   ret void
 }
 
@@ -330,26 +330,26 @@ define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(
 
 ; UNALIGNED: buffer_load_dwordx4
 ; UNALIGNED: buffer_store_dwordx4
-define amdgpu_kernel void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
-  %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
-  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
+define amdgpu_kernel void @global_unaligned_load_store_v4i32(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
+  %v = load <4 x i32>, ptr addrspace(1) %p, align 1
+  store <4 x i32> %v, ptr addrspace(1) %r, align 1
   ret void
 }
 
 ; GCN-LABEL: {{^}}local_load_i64_align_4:
 ; GCN: ds_read2_b32
-define amdgpu_kernel void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
-  %val = load i64, i64 addrspace(3)* %in, align 4
-  store i64 %val, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @local_load_i64_align_4(ptr addrspace(1) nocapture %out, ptr addrspace(3) %in) #0 {
+  %val = load i64, ptr addrspace(3) %in, align 4
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}local_load_i64_align_4_with_offset
 ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
-define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
-  %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
-  %val = load i64, i64 addrspace(3)* %ptr, align 4
-  store i64 %val, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @local_load_i64_align_4_with_offset(ptr addrspace(1) nocapture %out, ptr addrspace(3) %in) #0 {
+  %ptr = getelementptr i64, ptr addrspace(3) %in, i32 4
+  %val = load i64, ptr addrspace(3) %ptr, align 4
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -357,12 +357,10 @@ define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)*
 ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
 ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
 ; GCN: s_endpgm
-define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
-  %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
-  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
-  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
-  %val = load i64, i64 addrspace(3)* %ptri64, align 4
-  store i64 %val, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(ptr addrspace(1) nocapture %out, ptr addrspace(3) %in) #0 {
+  %ptr255 = getelementptr i32, ptr addrspace(3) %in, i32 255
+  %val = load i64, ptr addrspace(3) %ptr255, align 4
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
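
With opaque pointers, a bitcast that only changes the pointee type (and keeps the address space) is a no-op, so the conversion drops those casts and lets the getelementptr index directly on the incoming ptr, as in the hunk above. A small before/after sketch of that rewrite (illustrative; value names invented):

  ; typed pointers: reindexing as i32 needs round-trip bitcasts
  %ptr   = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
  %gep   = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptr64 = bitcast i32 addrspace(3)* %gep to i64 addrspace(3)*
  %val   = load i64, i64 addrspace(3)* %ptr64, align 4

  ; opaque pointers: the bitcasts disappear
  %gep = getelementptr i32, ptr addrspace(3) %in, i32 255
  %val = load i64, ptr addrspace(3) %gep, align 4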
 
@@ -376,25 +374,25 @@ define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspac
 ; GCN: ds_read_u8
 ; GCN: ds_read_u8
 ; GCN: store_dwordx2
-define amdgpu_kernel void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
-  %val = load i64, i64 addrspace(3)* %in, align 1
-  store i64 %val, i64 addrspace(1)* %out, align 8
+define amdgpu_kernel void @local_load_i64_align_1(ptr addrspace(1) nocapture %out, ptr addrspace(3) %in) #0 {
+  %val = load i64, ptr addrspace(3) %in, align 1
+  store i64 %val, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}local_store_i64_align_4:
 ; GCN: ds_write2_b32
-define amdgpu_kernel void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
-  store i64 %val, i64 addrspace(3)* %out, align 4
+define amdgpu_kernel void @local_store_i64_align_4(ptr addrspace(3) %out, i64 %val) #0 {
+  store i64 %val, ptr addrspace(3) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}local_store_i64_align_4_with_offset
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
 ; GCN: s_endpgm
-define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
-  %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
-  store i64 0, i64 addrspace(3)* %ptr, align 4
+define amdgpu_kernel void @local_store_i64_align_4_with_offset(ptr addrspace(3) %out) #0 {
+  %ptr = getelementptr i64, ptr addrspace(3) %out, i32 4
+  store i64 0, ptr addrspace(3) %ptr, align 4
   ret void
 }
 
@@ -402,11 +400,9 @@ define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)*
 ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; GCN: s_endpgm
-define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
-  %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
-  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
-  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
-  store i64 0, i64 addrspace(3)* %out, align 4
+define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(ptr addrspace(3) %out) #0 {
+  %ptr255 = getelementptr i32, ptr addrspace(3) %out, i32 255
+  store i64 0, ptr addrspace(3) %out, align 4
   ret void
 }
 
@@ -419,9 +415,9 @@ define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspa
 ; UNALIGNED: s_load_dword
 
 ; SI: buffer_store_dword
-define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
-  %v = load i32, i32 addrspace(4)* %p, align 1
-  store i32 %v, i32 addrspace(1)* %r, align 4
+define amdgpu_kernel void @constant_unaligned_load_i32(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
+  %v = load i32, ptr addrspace(4) %p, align 1
+  store i32 %v, ptr addrspace(1) %r, align 4
   ret void
 }
 
@@ -431,9 +427,9 @@ define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(4)* %p, i32
 
 ; UNALIGNED: s_load_dword
 ; UNALIGNED: buffer_store_dword
-define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
-  %v = load i32, i32 addrspace(4)* %p, align 2
-  store i32 %v, i32 addrspace(1)* %r, align 4
+define amdgpu_kernel void @constant_align2_load_i32(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
+  %v = load i32, ptr addrspace(4) %p, align 2
+  store i32 %v, ptr addrspace(1) %r, align 4
   ret void
 }
 
@@ -445,27 +441,27 @@ define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(4)* %p, i32 ad
 
 ; UNALIGNED: s_load_dwordx4
 ; UNALIGNED: buffer_store_dwordx2
-define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
-  %v = load i64, i64 addrspace(4)* %p, align 2
-  store i64 %v, i64 addrspace(1)* %r, align 4
+define amdgpu_kernel void @constant_align2_load_i64(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
+  %v = load i64, ptr addrspace(4) %p, align 2
+  store i64 %v, ptr addrspace(1) %r, align 4
   ret void
 }
 
 ; SI-LABEL: {{^}}constant_align4_load_i64:
 ; SI: s_load_dwordx2
 ; SI: buffer_store_dwordx2
-define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
-  %v = load i64, i64 addrspace(4)* %p, align 4
-  store i64 %v, i64 addrspace(1)* %r, align 4
+define amdgpu_kernel void @constant_align4_load_i64(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
+  %v = load i64, ptr addrspace(4) %p, align 4
+  store i64 %v, ptr addrspace(1) %r, align 4
   ret void
 }
 
 ; SI-LABEL: {{^}}constant_align4_load_v4i32:
 ; SI: s_load_dwordx4
 ; SI: buffer_store_dwordx4
-define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 {
-  %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 4
-  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
+define amdgpu_kernel void @constant_align4_load_v4i32(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
+  %v = load <4 x i32>, ptr addrspace(4) %p, align 4
+  store <4 x i32> %v, ptr addrspace(1) %r, align 4
   ret void
 }
 
@@ -483,9 +479,9 @@ define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(4)* %p
 ; UNALIGNED: buffer_load_dwordx2
 
 ; SI: buffer_store_dwordx2
-define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(4)* %p, <2 x i32> addrspace(1)* %r) #0 {
-  %v = load <2 x i32>, <2 x i32> addrspace(4)* %p, align 1
-  store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
+define amdgpu_kernel void @constant_unaligned_load_v2i32(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
+  %v = load <2 x i32>, ptr addrspace(4) %p, align 1
+  store <2 x i32> %v, ptr addrspace(1) %r, align 4
   ret void
 }
 
@@ -513,27 +509,27 @@ define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(4)*
 ; UNALIGNED: buffer_load_dwordx4
 
 ; SI: buffer_store_dwordx4
-define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 {
-  %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 1
-  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
+define amdgpu_kernel void @constant_unaligned_load_v4i32(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
+  %v = load <4 x i32>, ptr addrspace(4) %p, align 1
+  store <4 x i32> %v, ptr addrspace(1) %r, align 4
   ret void
 }
 
 ; SI-LABEL: {{^}}constant_align4_load_i8:
 ; SI: s_load_dword
 ; SI: buffer_store_byte
-define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 {
-  %v = load i8, i8 addrspace(4)* %p, align 4
-  store i8 %v, i8 addrspace(1)* %r, align 4
+define amdgpu_kernel void @constant_align4_load_i8(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
+  %v = load i8, ptr addrspace(4) %p, align 4
+  store i8 %v, ptr addrspace(1) %r, align 4
   ret void
 }
 
 ; SI-LABEL: {{^}}constant_align2_load_i8:
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_byte
-define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 {
-  %v = load i8, i8 addrspace(4)* %p, align 2
-  store i8 %v, i8 addrspace(1)* %r, align 2
+define amdgpu_kernel void @constant_align2_load_i8(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
+  %v = load i8, ptr addrspace(4) %p, align 2
+  store i8 %v, ptr addrspace(1) %r, align 2
   ret void
 }
 
@@ -542,14 +538,14 @@ define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(4)* %p, i8 addrs
 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
 ; SI: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
-define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
-  %gep0 = getelementptr i32, i32 addrspace(4)* %p, i64 1
-  %v0 = load i32, i32 addrspace(4)* %p, align 4
-  %v1 = load i32, i32 addrspace(4)* %gep0, align 4
+define amdgpu_kernel void @constant_align4_merge_load_2_i32(ptr addrspace(4) %p, ptr addrspace(1) %r) #0 {
+  %gep0 = getelementptr i32, ptr addrspace(4) %p, i64 1
+  %v0 = load i32, ptr addrspace(4) %p, align 4
+  %v1 = load i32, ptr addrspace(4) %gep0, align 4
 
-  %gep1 = getelementptr i32, i32 addrspace(1)* %r, i64 1
-  store i32 %v0, i32 addrspace(1)* %r, align 4
-  store i32 %v1, i32 addrspace(1)* %gep1, align 4
+  %gep1 = getelementptr i32, ptr addrspace(1) %r, i64 1
+  store i32 %v0, ptr addrspace(1) %r, align 4
+  store i32 %v1, ptr addrspace(1) %gep1, align 4
   ret void
 }
 
@@ -572,9 +568,9 @@ define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(4)* %p
 ; SI: ds_read_u8
 
 ; SI: ScratchSize: 0{{$}}
-define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
-  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1
-  store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
+define amdgpu_kernel void @local_load_align1_v16i8(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+  %ld = load <16 x i8>, ptr addrspace(3) %in, align 1
+  store <16 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
@@ -597,8 +593,8 @@ define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out,
 ; SI: ds_write_b8
 
 ; SI: ScratchSize: 0{{$}}
-define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
-  store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1
+define amdgpu_kernel void @local_store_align1_v16i8(ptr addrspace(3) %out) #0 {
+  store <16 x i8> zeroinitializer, ptr addrspace(3) %out, align 1
   ret void
 }
 
@@ -612,8 +608,8 @@ define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out
 ; MUBUF: buffer_load_ubyte
 ; MUBUF: buffer_load_ubyte
 ; FLATSCR: scratch_load_dwordx2
-define double @private_load_align1_f64(double addrspace(5)* %in) {
-  %x = load double, double addrspace(5)* %in, align 1
+define double @private_load_align1_f64(ptr addrspace(5) %in) {
+  %x = load double, ptr addrspace(5) %in, align 1
   ret double %x
 }
 
@@ -627,8 +623,8 @@ define double @private_load_align1_f64(double addrspace(5)* %in) {
 ; MUBUF: buffer_store_byte
 ; MUBUF: buffer_store_byte
 ; FLATSCR: scratch_store_dwordx2
-define void @private_store_align1_f64(double addrspace(5)* %out, double %x) #0 {
-  store double %x, double addrspace(5)* %out, align 1
+define void @private_store_align1_f64(ptr addrspace(5) %out, double %x) #0 {
+  store double %x, ptr addrspace(5) %out, align 1
   ret void
 }
 
@@ -636,8 +632,8 @@ define void @private_store_align1_f64(double addrspace(5)* %out, double %x) #0 {
 ; MUBUF: buffer_load_dword
 ; MUBUF: buffer_load_dword
 ; FLATSCR: scratch_load_dwordx2
-define double @private_load_align4_f64(double addrspace(5)* %in) {
-  %x = load double, double addrspace(5)* %in, align 4
+define double @private_load_align4_f64(ptr addrspace(5) %in) {
+  %x = load double, ptr addrspace(5) %in, align 4
   ret double %x
 }
 
@@ -645,8 +641,8 @@ define double @private_load_align4_f64(double addrspace(5)* %in) {
 ; MUBUF: buffer_store_dword
 ; MUBUF: buffer_store_dword
 ; FLATSCR: scratch_store_dwordx2
-define void @private_store_align4_f64(double addrspace(5)* %out, double %x) #0 {
-  store double %x, double addrspace(5)* %out, align 4
+define void @private_store_align4_f64(ptr addrspace(5) %out, double %x) #0 {
+  store double %x, ptr addrspace(5) %out, align 4
   ret void
 }
 
@@ -656,8 +652,8 @@ define void @private_store_align4_f64(double addrspace(5)* %out, double %x) #0 {
 ; MUBUF: buffer_load_ushort
 ; MUBUF: buffer_load_ushort
 ; FLATSCR: scratch_load_dwordx2
-define double @private_load_align2_f64(double addrspace(5)* %in) {
-  %x = load double, double addrspace(5)* %in, align 2
+define double @private_load_align2_f64(ptr addrspace(5) %in) {
+  %x = load double, ptr addrspace(5) %in, align 2
   ret double %x
 }
 
@@ -667,25 +663,25 @@ define double @private_load_align2_f64(double addrspace(5)* %in) {
 ; MUBUF: buffer_store_short
 ; MUBUF: buffer_store_short
 ; FLATSCR: scratch_store_dwordx2
-define void @private_store_align2_f64(double addrspace(5)* %out, double %x) #0 {
-  store double %x, double addrspace(5)* %out, align 2
+define void @private_store_align2_f64(ptr addrspace(5) %out, double %x) #0 {
+  store double %x, ptr addrspace(5) %out, align 2
   ret void
 }
 
 ; Should not merge this to a dword store
-define amdgpu_kernel void @global_store_2xi16_align2(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
-  %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
-  %v = load i16, i16 addrspace(1)* %p, align 2
-  store i16 1, i16 addrspace(1)* %r, align 2
-  store i16 2, i16 addrspace(1)* %gep.r, align 2
+define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
+  %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
+  %v = load i16, ptr addrspace(1) %p, align 2
+  store i16 1, ptr addrspace(1) %r, align 2
+  store i16 2, ptr addrspace(1) %gep.r, align 2
   ret void
 }
 
 ; Should not merge this to a word load
-define i32 @load_2xi16_align2(i16 addrspace(1)* %p) #0 {
-  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
-  %p.0 = load i16, i16 addrspace(1)* %p, align 2
-  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 2
+define i32 @load_2xi16_align2(ptr addrspace(1) %p) #0 {
+  %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
+  %p.0 = load i16, ptr addrspace(1) %p, align 2
+  %p.1 = load i16, ptr addrspace(1) %gep.p, align 2
   %zext.0 = zext i16 %p.0 to i32
   %zext.1 = zext i16 %p.1 to i32
   %shl.1 = shl i32 %zext.1, 16

diff --git a/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
index 1bb427693171d..64858f2fbf860 100644
--- a/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
+++ b/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
@@ -5,7 +5,7 @@
 ; SI hits an assertion at -O0, evergreen hits a not implemented unreachable.
 
 ; COMMON-LABEL: {{^}}branch_true:
-define amdgpu_kernel void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_true(ptr addrspace(1) nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 true, label %for.end, label %for.body.lr.ph
 
@@ -16,22 +16,17 @@ for.body.lr.ph:                                   ; preds = %entry
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
-  %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
-  %1 = load i32, i32 addrspace(1)* %0, align 4
-  %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride
-  %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  %3 = load i32, i32 addrspace(1)* %2, align 4
-  %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
-  %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  %5 = load i32, i32 addrspace(1)* %4, align 4
-  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
-  %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
-  %7 = load i32, i32 addrspace(1)* %6, align 4
-  %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
-  %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
-  %9 = load i32, i32 addrspace(1)* %8, align 4
-  %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef
+  %main.addr.011 = phi ptr addrspace(1) [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
+  %0 = load i32, ptr addrspace(1) %main.addr.011, align 4
+  %add.ptr = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %main_stride
+  %1 = load i32, ptr addrspace(1) %add.ptr, align 4
+  %add.ptr1 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %add.ptr.sum
+  %2 = load i32, ptr addrspace(1) %add.ptr1, align 4
+  %add.ptr2 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %add.ptr1.sum
+  %3 = load i32, ptr addrspace(1) %add.ptr2, align 4
+  %add.ptr3 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %add.ptr4.sum
+  %4 = load i32, ptr addrspace(1) %add.ptr3, align 4
+  %add.ptr6 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 undef
   br i1 undef, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body, %entry
@@ -41,7 +36,7 @@ for.end:                                          ; preds = %for.body, %entry
 ; COMMON-LABEL: {{^}}branch_false:
 ; SI: s_cbranch_scc1
 ; SI: s_endpgm
-define amdgpu_kernel void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_false(ptr addrspace(1) nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 false, label %for.end, label %for.body.lr.ph
 
@@ -52,22 +47,17 @@ for.body.lr.ph:                                   ; preds = %entry
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
-  %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
-  %1 = load i32, i32 addrspace(1)* %0, align 4
-  %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride
-  %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  %3 = load i32, i32 addrspace(1)* %2, align 4
-  %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
-  %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  %5 = load i32, i32 addrspace(1)* %4, align 4
-  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
-  %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
-  %7 = load i32, i32 addrspace(1)* %6, align 4
-  %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
-  %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
-  %9 = load i32, i32 addrspace(1)* %8, align 4
-  %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef
+  %main.addr.011 = phi ptr addrspace(1) [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
+  %0 = load i32, ptr addrspace(1) %main.addr.011, align 4
+  %add.ptr = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %main_stride
+  %1 = load i32, ptr addrspace(1) %add.ptr, align 4
+  %add.ptr1 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %add.ptr.sum
+  %2 = load i32, ptr addrspace(1) %add.ptr1, align 4
+  %add.ptr2 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %add.ptr1.sum
+  %3 = load i32, ptr addrspace(1) %add.ptr2, align 4
+  %add.ptr3 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %add.ptr4.sum
+  %4 = load i32, ptr addrspace(1) %add.ptr3, align 4
+  %add.ptr6 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 undef
   br i1 undef, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body, %entry
@@ -78,7 +68,7 @@ for.end:                                          ; preds = %for.body, %entry
 ; SI: s_cbranch_scc1
 ; SI: s_cbranch_scc1
 ; SI: s_endpgm
-define amdgpu_kernel void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_undef(ptr addrspace(1) nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 undef, label %for.end, label %for.body.lr.ph
 
@@ -89,22 +79,17 @@ for.body.lr.ph:                                   ; preds = %entry
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
-  %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
-  %1 = load i32, i32 addrspace(1)* %0, align 4
-  %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride
-  %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
-  %3 = load i32, i32 addrspace(1)* %2, align 4
-  %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
-  %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
-  %5 = load i32, i32 addrspace(1)* %4, align 4
-  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
-  %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
-  %7 = load i32, i32 addrspace(1)* %6, align 4
-  %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
-  %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
-  %9 = load i32, i32 addrspace(1)* %8, align 4
-  %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef
+  %main.addr.011 = phi ptr addrspace(1) [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
+  %0 = load i32, ptr addrspace(1) %main.addr.011, align 4
+  %add.ptr = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %main_stride
+  %1 = load i32, ptr addrspace(1) %add.ptr, align 4
+  %add.ptr1 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %add.ptr.sum
+  %2 = load i32, ptr addrspace(1) %add.ptr1, align 4
+  %add.ptr2 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %add.ptr1.sum
+  %3 = load i32, ptr addrspace(1) %add.ptr2, align 4
+  %add.ptr3 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 %add.ptr4.sum
+  %4 = load i32, ptr addrspace(1) %add.ptr3, align 4
+  %add.ptr6 = getelementptr inbounds i8, ptr addrspace(1) %main.addr.011, i32 undef
   br i1 undef, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body, %entry
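
Because each deleted bitcast frees up an unnamed value number, the instructions after it are renumbered as well: the loads that used to be %1, %3, %5, %7 and %9 in these loop bodies become %0 through %4. A tiny sketch of the effect (illustrative only):

  ; typed pointers: the bitcast occupies %0, the load is %1
  %0 = bitcast i8 addrspace(1)* %p to i32 addrspace(1)*
  %1 = load i32, i32 addrspace(1)* %0, align 4

  ; opaque pointers: the load itself becomes %0
  %0 = load i32, ptr addrspace(1) %p, align 4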

diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index 7cb57ab6b0c31..3dbb3fcce6c80 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -12,7 +12,7 @@
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define amdgpu_kernel void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc(i32 %cond, ptr addrspace(1) %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -25,7 +25,7 @@ else:
 
 done:
   %value = phi i32 [0, %if], [1, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -40,7 +40,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define amdgpu_kernel void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_vcc(float %cond, ptr addrspace(1) %out) {
 entry:
   %cmp0 = fcmp oeq float %cond, 0.0
   br i1 %cmp0, label %if, label %else
@@ -53,7 +53,7 @@ else:
 
 done:
   %value = phi i32 [0, %if], [1, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -68,7 +68,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, ptr addrspace(1) %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %else, label %if
@@ -81,7 +81,7 @@ else:
 
 done:
   %value = phi i32 [0, %if], [1, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -96,7 +96,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, ptr addrspace(1) %out) {
 entry:
   %cmp0 = fcmp oeq float %cond, 0.0
   br i1 %cmp0, label %else, label %if
@@ -109,7 +109,7 @@ else:
 
 done:
   %value = phi i32 [0, %if], [1, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -122,7 +122,7 @@ done:
 ; GCN: buffer_store_dword
 ; GCN: [[ENDIF_LABEL]]:
 ; GCN: s_endpgm
-define amdgpu_kernel void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
+define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a) {
 entry:
   %a.0 = fadd float %a, 10.0
   %cond = bitcast float %a.0 to i32
@@ -130,7 +130,7 @@ entry:
   br i1 %cmp, label %if, label %endif
 
 if:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   br label %endif
 
 endif:
@@ -146,7 +146,7 @@ endif:
 ; GCN: buffer_store_dword
 ; GCN: [[ENDIF_LABEL]]:
 ; GCN: s_endpgm
-define amdgpu_kernel void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
+define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, float %a) {
 entry:
   %a.0 = fadd float %a, 10.0
   %cond = bitcast float %a.0 to i32
@@ -154,7 +154,7 @@ entry:
   br i1 %cmp, label %if, label %endif
 
 if:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   br label %endif
 
 endif:
@@ -174,17 +174,17 @@ endif:
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
+define amdgpu_kernel void @uniform_if_else_ret(ptr addrspace(1) nocapture %out, i32 %a) {
 entry:
   %cmp = icmp eq i32 %a, 0
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
-  store i32 1, i32 addrspace(1)* %out
+  store i32 1, ptr addrspace(1) %out
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  store i32 2, i32 addrspace(1)* %out
+  store i32 2, ptr addrspace(1) %out
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
@@ -207,21 +207,21 @@ if.end:                                           ; preds = %if.else, %if.then
 ; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
 ; GCN: buffer_store_dword [[THREE]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
+define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr addrspace(1) nocapture %out1, i32 %a) {
 entry:
   %cmp = icmp eq i32 %a, 0
   br i1 %cmp, label %if.then, label %if.else
 
 if.then:                                          ; preds = %entry
-  store i32 1, i32 addrspace(1)* %out0
+  store i32 1, ptr addrspace(1) %out0
   br label %if.end
 
 if.else:                                          ; preds = %entry
-  store i32 2, i32 addrspace(1)* %out0
+  store i32 2, ptr addrspace(1) %out0
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
-  store i32 3, i32 addrspace(1)* %out1
+  store i32 3, ptr addrspace(1) %out1
   ret void
 }
 
@@ -231,14 +231,14 @@ if.end:                                           ; preds = %if.else, %if.then
 ; GCN: buffer_store_dword
 ; GCN: [[LABEL]]:
 ; GCN: s_endpgm
-define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @icmp_2_users(ptr addrspace(1) %out, i32 %cond) {
 main_body:
   %0 = icmp sgt i32 %cond, 0
   %1 = sext i1 %0 to i32
   br i1 %0, label %IF, label %ENDIF
 
 IF:
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   br label %ENDIF
 
 ENDIF:                                            ; preds = %IF, %main_body
@@ -256,7 +256,7 @@ ENDIF:                                            ; preds = %IF, %main_body
 ; GCN: {{^}}[[BODY]]:
 ; GCN: buffer_store
 ; GCN: s_endpgm
-define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, ptr addrspace(1) %out) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %cmp0 = icmp sgt i32 %cond0, 0
@@ -269,7 +269,7 @@ bb2:                                              ; preds = %bb
   br i1 %cmp1, label %bb9, label %bb7
 
 bb7:                                              ; preds = %bb5
-  store i32 %tmp3, i32 addrspace(1)* %out
+  store i32 %tmp3, ptr addrspace(1) %out
   br label %bb9
 
 bb9:                                              ; preds = %bb8, %bb4
@@ -282,7 +282,7 @@ bb9:                                              ; preds = %bb8, %bb4
 ; SI: s_cmp_lg_u32 [[I]], 0
 ; SI: s_cbranch_scc1 [[LOOP_LABEL]]
 ; SI: s_endpgm
-define amdgpu_kernel void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @uniform_loop(ptr addrspace(1) %out, i32 %a) {
 entry:
   br label %loop
 
@@ -307,19 +307,19 @@ done:
 ; GCN: {{^}}[[IF_UNIFORM_LABEL]]:
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
-define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %cond) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %d_cmp = icmp ult i32 %tid, 16
   br i1 %d_cmp, label %if, label %endif
 
 if:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   %u_cmp = icmp eq i32 %cond, 0
   br i1 %u_cmp, label %if_uniform, label %endif
 
 if_uniform:
-  store i32 1, i32 addrspace(1)* %out
+  store i32 1, ptr addrspace(1) %out
   br label %endif
 
 endif:
@@ -337,19 +337,19 @@ endif:
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 %cond) {
 entry:
   %u_cmp = icmp eq i32 %cond, 0
   br i1 %u_cmp, label %if, label %endif
 
 if:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %d_cmp = icmp ult i32 %tid, 16
   br i1 %d_cmp, label %if_uniform, label %endif
 
 if_uniform:
-  store i32 1, i32 addrspace(1)* %out
+  store i32 1, ptr addrspace(1) %out
   br label %endif
 
 endif:
@@ -368,14 +368,14 @@ endif:
 ; GCN: [[IF_UNIFORM]]:
 ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
 ; GCN: buffer_store_dword [[TWO]]
-define amdgpu_kernel void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @divergent_if_uniform_if(ptr addrspace(1) %out, i32 %cond) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %d_cmp = icmp eq i32 %tid, 0
   br i1 %d_cmp, label %if, label %endif
 
 if:
-  store i32 1, i32 addrspace(1)* %out
+  store i32 1, ptr addrspace(1) %out
   br label %endif
 
 endif:
@@ -383,7 +383,7 @@ endif:
   br i1 %u_cmp, label %if_uniform, label %exit
 
 if_uniform:
-  store i32 2, i32 addrspace(1)* %out
+  store i32 2, ptr addrspace(1) %out
   br label %exit
 
 exit:
@@ -408,20 +408,20 @@ exit:
 
 ; GCN: .LBB[[FNNUM]]_3:
 ; GCN: s_endpgm
-define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr addrspace(1) %out) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp1 = icmp sgt i32 %cond, 0
   br i1 %tmp1, label %bb2, label %bb9
 
 bb2:                                              ; preds = %bb
-  %tmp3 = load volatile i32, i32 addrspace(1)* undef
-  store volatile i32 0, i32 addrspace(1)* undef
+  %tmp3 = load volatile i32, ptr addrspace(1) undef
+  store volatile i32 0, ptr addrspace(1) undef
   %tmp9 = icmp sle i32 %cond, 0
   br i1 %tmp9, label %bb9, label %bb7
 
 bb7:                                              ; preds = %bb5
-  store i32 %tmp3, i32 addrspace(1)* %out
+  store i32 %tmp3, ptr addrspace(1) %out
   br label %bb9
 
 bb9:                                              ; preds = %bb8, %bb4
@@ -442,7 +442,7 @@ bb9:                                              ; preds = %bb8, %bb4
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %out) {
 entry:
   %cmp0 = icmp eq i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -455,7 +455,7 @@ else:
 
 done:
   %value = phi i32 [0, %if], [1, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -474,7 +474,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %out) {
 entry:
   %cmp0 = icmp ne i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -487,7 +487,7 @@ else:
 
 done:
   %value = phi i32 [0, %if], [1, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -502,7 +502,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %out) {
 entry:
   %cmp0 = icmp sgt i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -515,14 +515,14 @@ else:
 
 done:
   %value = phi i32 [0, %if], [1, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}move_to_valu_i64_eq:
 ; GCN: v_cmp_eq_u64_e32
-define amdgpu_kernel void @move_to_valu_i64_eq(i32 addrspace(1)* %out) {
-  %cond = load volatile i64, i64 addrspace(3)* undef
+define amdgpu_kernel void @move_to_valu_i64_eq(ptr addrspace(1) %out) {
+  %cond = load volatile i64, ptr addrspace(3) undef
   %cmp0 = icmp eq i64 %cond, 0
   br i1 %cmp0, label %if, label %else
 
@@ -534,14 +534,14 @@ else:
 
 done:
   %value = phi i32 [0, %if], [1, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}move_to_valu_i64_ne:
 ; GCN: v_cmp_ne_u64_e32
-define amdgpu_kernel void @move_to_valu_i64_ne(i32 addrspace(1)* %out) {
-  %cond = load volatile i64, i64 addrspace(3)* undef
+define amdgpu_kernel void @move_to_valu_i64_ne(ptr addrspace(1) %out) {
+  %cond = load volatile i64, ptr addrspace(3) undef
   %cmp0 = icmp ne i64 %cond, 0
   br i1 %cmp0, label %if, label %else
 
@@ -553,25 +553,25 @@ else:
 
 done:
   %value = phi i32 [0, %if], [1, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}move_to_valu_vgpr_operand_phi:
 ; GCN: v_add_{{[iu]}}32_e32
 ; GCN: ds_write_b32
-define void @move_to_valu_vgpr_operand_phi(i32 addrspace(3)* %out) {
+define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
 bb0:
   br label %bb1
 
 bb1:                                              ; preds = %bb3, %bb0
   %tmp0 = phi i32 [ 8, %bb0 ], [ %tmp4, %bb3 ]
   %tmp1 = add nsw i32 %tmp0, -1
-  %tmp2 = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tmp1
+  %tmp2 = getelementptr inbounds i32, ptr addrspace(3) %out, i32 %tmp1
   br i1 undef, label %bb2, label %bb3
 
 bb2:                                              ; preds = %bb1
-  store volatile i32 1, i32 addrspace(3)* %tmp2, align 4
+  store volatile i32 1, ptr addrspace(3) %tmp2, align 4
   br label %bb3
 
 bb3:                                              ; preds = %bb2, %bb1

diff --git a/llvm/test/CodeGen/AMDGPU/uniform-crash.ll b/llvm/test/CodeGen/AMDGPU/uniform-crash.ll
index 4e474544e8b06..c00ef9a710e76 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-crash.ll
@@ -6,14 +6,14 @@
 ; GCN: s_cbranch_scc1 [[LABEL:.LBB[0-9_A-Z]+]]
 ; GCN: [[LABEL]]:
 ; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @icmp_2_users(ptr addrspace(1) %out, i32 %cond) {
 main_body:
   %0 = icmp sgt i32 %cond, 0
   %1 = sext i1 %0 to i32
   br i1 %0, label %IF, label %ENDIF
 
 IF:
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   br label %ENDIF
 
 ENDIF:                                            ; preds = %IF, %main_body
@@ -35,7 +35,7 @@ bb2:                                              ; preds = %bb
   br label %bb3
 
 bb3:                                              ; preds = %bb3, %bb2
-  %val = load volatile i32, i32 addrspace(4)* undef
+  %val = load volatile i32, ptr addrspace(4) undef
   %tmp4 = icmp eq i32 %val, %arg1
   br i1 %tmp4, label %bb5, label %bb3
 

diff --git a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
index 3716fd33bfadb..3c42ef89465d9 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -33,7 +33,7 @@ out:
 ; CHECK-LABEL: {{^}}test2:
 ; CHECK: s_and_saveexec_b64
 ; CHECK-NEXT: s_cbranch_execz
-define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test2(ptr addrspace(1) %out, i32 %a, i32 %b) {
 main_body:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %cc = icmp eq i32 %tid, 0

diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
index 60c42b0096b0b..6012ca681e1d1 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
@@ -10,10 +10,10 @@
 define void @func() #0 {
 ; CHECK-LABEL: define {{[^@]+}}@func
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    store i32 0, i32* @x, align 4
+; CHECK-NEXT:    store i32 0, ptr @x, align 4
 ; CHECK-NEXT:    ret void
 ;
-  store i32 0, i32* @x
+  store i32 0, ptr @x
   ret void
 }
 
@@ -31,10 +31,10 @@ define amdgpu_kernel void @kernel1() #1 {
 define weak_odr void @weak_func() #0 {
 ; CHECK-LABEL: define {{[^@]+}}@weak_func
 ; CHECK-SAME: () #[[ATTR0]] {
-; CHECK-NEXT:    store i32 0, i32* @x, align 4
+; CHECK-NEXT:    store i32 0, ptr @x, align 4
 ; CHECK-NEXT:    ret void
 ;
-  store i32 0, i32* @x
+  store i32 0, ptr @x
   ret void
 }
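
For pointers in the default address space the pointee type is simply dropped, and the FileCheck expectations embedded in the test have to be updated along with the IR, since they quote the IR text verbatim. A minimal sketch of the paired update (prefix and global taken from the test above; purely illustrative):

  ; before
  ; CHECK-NEXT:    store i32 0, i32* @x, align 4
    store i32 0, i32* @x

  ; after
  ; CHECK-NEXT:    store i32 0, ptr @x, align 4
    store i32 0, ptr @x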
 

diff --git a/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll b/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
index cc622efdc54ed..f2cd30abf3a8d 100644
--- a/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
@@ -9,7 +9,7 @@
 ;
 ; Check for a valid output.
 ; CHECK: image_sample_c
-define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(4)* inreg dereferenceable(18446744073709551615) %arg, [16 x <16 x i8>] addrspace(4)* inreg dereferenceable(18446744073709551615) %arg1, [32 x <8 x i32>] addrspace(4)* inreg dereferenceable(18446744073709551615) %arg2, [16 x <8 x i32>] addrspace(4)* inreg dereferenceable(18446744073709551615) %arg3, [16 x <4 x i32>] addrspace(4)* inreg dereferenceable(18446744073709551615) %arg4, float inreg %arg5, i32 inreg %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <3 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, <2 x i32> %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, i32 %arg20, float %arg21, i32 %arg22) #0 {
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main(ptr addrspace(4) inreg dereferenceable(18446744073709551615) %arg, ptr addrspace(4) inreg dereferenceable(18446744073709551615) %arg1, ptr addrspace(4) inreg dereferenceable(18446744073709551615) %arg2, ptr addrspace(4) inreg dereferenceable(18446744073709551615) %arg3, ptr addrspace(4) inreg dereferenceable(18446744073709551615) %arg4, float inreg %arg5, i32 inreg %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <3 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, <2 x i32> %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, i32 %arg20, float %arg21, i32 %arg22) #0 {
 main_body:
   %i.i = extractelement <2 x i32> %arg8, i32 0
   %j.i = extractelement <2 x i32> %arg8, i32 1

diff --git a/llvm/test/CodeGen/AMDGPU/unpack-half.ll b/llvm/test/CodeGen/AMDGPU/unpack-half.ll
index b2133986ba5b2..2ee42fb79b7d2 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-half.ll
+++ b/llvm/test/CodeGen/AMDGPU/unpack-half.ll
@@ -9,7 +9,7 @@
 
 define amdgpu_gs void @main(i32 inreg %arg) local_unnamed_addr #0 {
 .entry:
-  %tmp = load volatile float, float addrspace(1)* undef
+  %tmp = load volatile float, ptr addrspace(1) undef
   %tmp1 = bitcast float %tmp to i32
   %im0.i = lshr i32 %tmp1, 16
   %tmp2 = insertelement <2 x i32> undef, i32 %im0.i, i32 1
@@ -18,7 +18,7 @@ define amdgpu_gs void @main(i32 inreg %arg) local_unnamed_addr #0 {
   %tmp5 = fpext <2 x half> %tmp4 to <2 x float>
   %bc = bitcast <2 x float> %tmp5 to <2 x i32>
   %tmp6 = extractelement <2 x i32> %bc, i32 1
-  store volatile i32 %tmp6, i32 addrspace(1)* undef
+  store volatile i32 %tmp6, ptr addrspace(1) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 3d0e762dbdbfa..0988921a2452b 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -143,7 +143,7 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_branch .LBB0_4
 ; SI-OPT-LABEL: @widget(
 ; SI-OPT-NEXT:  bb:
-; SI-OPT-NEXT:    [[TMP:%.*]] = load i32, i32 addrspace(1)* null, align 16
+; SI-OPT-NEXT:    [[TMP:%.*]] = load i32, ptr addrspace(1) null, align 16
 ; SI-OPT-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[TMP]], 21
 ; SI-OPT-NEXT:    br i1 [[TMP1]], label [[BB4:%.*]], label [[BB2:%.*]]
 ; SI-OPT:       bb2:
@@ -168,10 +168,10 @@ define hidden void @widget() {
 ; SI-OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
 ; SI-OPT-NEXT:    br label [[BB12]]
 ; SI-OPT:       bb12:
-; SI-OPT-NEXT:    store float 0.000000e+00, float addrspace(1)* null, align 8
+; SI-OPT-NEXT:    store float 0.000000e+00, ptr addrspace(1) null, align 8
 ; SI-OPT-NEXT:    ret void
 bb:
-  %tmp = load i32, i32 addrspace(1)* null, align 16
+  %tmp = load i32, ptr addrspace(1) null, align 16
   %tmp1 = icmp slt i32 %tmp, 21
   br i1 %tmp1, label %bb4, label %bb2
 
@@ -196,7 +196,7 @@ bb9:                                              ; preds = %bb4, %bb2
   br i1 %tmp11, label %bb6, label %bb12
 
 bb12:                                             ; preds = %bb9, %bb2
-  store float 0.000000e+00, float addrspace(1)* null, align 8
+  store float 0.000000e+00, ptr addrspace(1) null, align 8
   ret void
 }
 
@@ -206,15 +206,15 @@ declare hidden float @wibble() local_unnamed_addr
 define hidden void @blam() {
 ; SI-OPT-LABEL: @blam(
 ; SI-OPT-NEXT:  bb:
-; SI-OPT-NEXT:    [[TMP:%.*]] = load float, float* null, align 16
+; SI-OPT-NEXT:    [[TMP:%.*]] = load float, ptr null, align 16
 ; SI-OPT-NEXT:    br label [[BB2:%.*]]
 ; SI-OPT:       bb1:
 ; SI-OPT-NEXT:    br label [[BB2]]
 ; SI-OPT:       bb2:
 ; SI-OPT-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; SI-OPT-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* null, i32 [[TID]]
-; SI-OPT-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 16
-; SI-OPT-NEXT:    store float 0.000000e+00, float addrspace(5)* null, align 8
+; SI-OPT-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) null, i32 [[TID]]
+; SI-OPT-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 16
+; SI-OPT-NEXT:    store float 0.000000e+00, ptr addrspace(5) null, align 8
 ; SI-OPT-NEXT:    br label [[BB4:%.*]]
 ; SI-OPT:       bb4:
 ; SI-OPT-NEXT:    [[TMP5:%.*]] = icmp slt i32 [[TMP3]], 3
@@ -236,7 +236,7 @@ define hidden void @blam() {
 ; SI-OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP5]])
 ; SI-OPT-NEXT:    br label [[BB1]]
 ; SI-OPT:       bb10:
-; SI-OPT-NEXT:    store float 0x7FF8000000000000, float addrspace(5)* null, align 16
+; SI-OPT-NEXT:    store float 0x7FF8000000000000, ptr addrspace(5) null, align 16
 ; SI-OPT-NEXT:    br label [[BB18:%.*]]
 ; SI-OPT:       bb11:
 ; SI-OPT-NEXT:    [[TMP12:%.*]] = call float @spam()
@@ -254,13 +254,13 @@ define hidden void @blam() {
 ; SI-OPT-NEXT:    br i1 [[TMP10]], label [[BB17:%.*]], label [[BB16:%.*]]
 ; SI-OPT:       bb16:
 ; SI-OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP11]])
-; SI-OPT-NEXT:    store float 0x7FF8000000000000, float addrspace(5)* null, align 16
+; SI-OPT-NEXT:    store float 0x7FF8000000000000, ptr addrspace(5) null, align 16
 ; SI-OPT-NEXT:    br label [[BB17]]
 ; SI-OPT:       bb17:
-; SI-OPT-NEXT:    store float [[TMP]], float addrspace(5)* null, align 16
+; SI-OPT-NEXT:    store float [[TMP]], ptr addrspace(5) null, align 16
 ; SI-OPT-NEXT:    br label [[BB18]]
 ; SI-OPT:       bb18:
-; SI-OPT-NEXT:    store float 0x7FF8000000000000, float addrspace(5)* null, align 4
+; SI-OPT-NEXT:    store float 0x7FF8000000000000, ptr addrspace(5) null, align 4
 ; SI-OPT-NEXT:    br label [[BB2]]
 ;
 ; GCN-LABEL: blam:
@@ -391,7 +391,7 @@ define hidden void @blam() {
 ; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], 0
 ; GCN-NEXT:    s_branch .LBB1_2
 bb:
-  %tmp = load float, float* null, align 16
+  %tmp = load float, ptr null, align 16
   br label %bb2
 
 bb1:                                              ; preds = %bb8, %bb6
@@ -399,9 +399,9 @@ bb1:                                              ; preds = %bb8, %bb6
 
 bb2:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep  = getelementptr inbounds i32, i32 addrspace(1)* null, i32 %tid
-  %tmp3 = load i32, i32 addrspace(1)* %gep, align 16
-  store float 0.000000e+00, float addrspace(5)* null, align 8
+  %gep  = getelementptr inbounds i32, ptr addrspace(1) null, i32 %tid
+  %tmp3 = load i32, ptr addrspace(1) %gep, align 16
+  store float 0.000000e+00, ptr addrspace(5) null, align 8
   br label %bb4
 
 bb4:                                              ; preds = %bb2
@@ -417,7 +417,7 @@ bb8:                                              ; preds = %bb4
   br i1 %tmp9, label %bb10, label %bb1
 
 bb10:                                             ; preds = %bb8
-  store float 0x7FF8000000000000, float addrspace(5)* null, align 16
+  store float 0x7FF8000000000000, ptr addrspace(5) null, align 16
   br label %bb18
 
 bb11:                                             ; preds = %bb6
@@ -430,15 +430,15 @@ bb14:                                             ; preds = %bb11
   br i1 %tmp15, label %bb17, label %bb16
 
 bb16:                                             ; preds = %bb14
-  store float 0x7FF8000000000000, float addrspace(5)* null, align 16
+  store float 0x7FF8000000000000, ptr addrspace(5) null, align 16
   br label %bb17
 
 bb17:                                             ; preds = %bb16, %bb14
-  store float %tmp, float addrspace(5)* null, align 16
+  store float %tmp, ptr addrspace(5) null, align 16
   br label %bb18
 
 bb18:                                             ; preds = %bb17, %bb10
-  store float 0x7FF8000000000000, float addrspace(5)* null, align 4
+  store float 0x7FF8000000000000, ptr addrspace(5) null, align 4
   br label %bb2
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/urem.ll b/llvm/test/CodeGen/AMDGPU/urem.ll
index 40731ce14e83a..c3d4a42777f28 100644
--- a/llvm/test/CodeGen/AMDGPU/urem.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem.ll
@@ -9,12 +9,12 @@
 ; FUNC-LABEL: {{^}}test_urem_i32:
 ; SI: s_endpgm
 ; EG: CF_END
-define amdgpu_kernel void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1)* %in
-  %b = load i32, i32 addrspace(1)* %b_ptr
+define amdgpu_kernel void @test_urem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
   %result = urem i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -26,69 +26,69 @@ define amdgpu_kernel void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1
 ; SI: v_subrev_{{[iu]}}32
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define amdgpu_kernel void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %num = load i32, i32 addrspace(1) * %in
+define amdgpu_kernel void @test_urem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %num = load i32, ptr addrspace(1) %in
   %result = urem i32 %num, 7
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}test_urem_v2i32:
 ; SI: s_endpgm
 ; EG: CF_END
-define amdgpu_kernel void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
-  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
+define amdgpu_kernel void @test_urem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <2 x i32>, ptr addrspace(1) %in
+  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
   %result = urem <2 x i32> %a, %b
-  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}test_urem_v4i32:
 ; SI: s_endpgm
 ; EG: CF_END
-define amdgpu_kernel void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
-  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
+define amdgpu_kernel void @test_urem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
+  %a = load <4 x i32>, ptr addrspace(1) %in
+  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
   %result = urem <4 x i32> %a, %b
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}test_urem_i64:
 ; SI: s_endpgm
 ; EG: CF_END
-define amdgpu_kernel void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
-  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
-  %a = load i64, i64 addrspace(1)* %in
-  %b = load i64, i64 addrspace(1)* %b_ptr
+define amdgpu_kernel void @test_urem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1
+  %a = load i64, ptr addrspace(1) %in
+  %b = load i64, ptr addrspace(1) %b_ptr
   %result = urem i64 %a, %b
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}test_urem_v2i64:
 ; SI: s_endpgm
 ; EG: CF_END
-define amdgpu_kernel void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
-  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
-  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
-  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
+define amdgpu_kernel void @test_urem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1
+  %a = load <2 x i64>, ptr addrspace(1) %in
+  %b = load <2 x i64>, ptr addrspace(1) %b_ptr
   %result = urem <2 x i64> %a, %b
-  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}test_urem_v4i64:
 ; SI: s_endpgm
 ; EG: CF_END
-define amdgpu_kernel void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
-  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
-  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
+define amdgpu_kernel void @test_urem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
+  %a = load <4 x i64>, ptr addrspace(1) %in
+  %b = load <4 x i64>, ptr addrspace(1) %b_ptr
   %result = urem <4 x i64> %a, %b
-  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+  store <4 x i64> %result, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 33038bb4fe238..79643026ea70f 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-IR %s
 
-define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_urem_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0xd
@@ -209,7 +209,7 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 %x, %y
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -416,7 +416,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_urem31_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_urem31_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xe
@@ -469,11 +469,11 @@ define amdgpu_kernel void @s_test_urem31_i64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = lshr i64 %x, 33
   %2 = lshr i64 %y, 33
   %result = urem i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
 ; GCN-LABEL: s_test_urem31_v2i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
@@ -556,11 +556,11 @@ define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2
   %1 = lshr <2 x i64> %x, <i64 33, i64 33>
   %2 = lshr <2 x i64> %y, <i64 33, i64 33>
   %result = urem <2 x i64> %1, %2
-  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_urem24_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_urem24_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xe
@@ -613,11 +613,11 @@ define amdgpu_kernel void @s_test_urem24_i64(i64 addrspace(1)* %out, i64 %x, i64
   %1 = lshr i64 %x, 40
   %2 = lshr i64 %y, 40
   %result = urem i64 %1, %2
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
 ; GCN-LABEL: s_test_urem23_64_v2i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
@@ -700,11 +700,11 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out,
   %1 = lshr <2 x i64> %x, <i64 33, i64 41>
   %2 = lshr <2 x i64> %y, <i64 33, i64 41>
   %result = urem <2 x i64> %1, %2
-  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  store <2 x i64> %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_urem_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -887,11 +887,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 24, %x
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_urem_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
@@ -1068,7 +1068,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 %x, 24
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -1348,7 +1348,7 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
   ret i64 %result
 }
 
-define amdgpu_kernel void @s_test_urem24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_urem24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1396,11 +1396,11 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_endpgm
   %x.shr = lshr i64 %x, 40
   %result = urem i64 24, %x.shr
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @s_test_urem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_test_urem24_k_den_i64(ptr addrspace(1) %out, i64 %x) {
 ; GCN-LABEL: s_test_urem24_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1452,7 +1452,7 @@ define amdgpu_kernel void @s_test_urem24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_endpgm
   %x.shr = lshr i64 %x, 40
   %result = urem i64 %x.shr, 23423
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 8de306f9300f5..f1edd5c74b105 100644
--- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -11,9 +11,9 @@ declare float @llvm.amdgcn.div.fixup.f32(float, float, float) #1
 ; GCN: s_load_dword [[SGPR:s[0-9]+]],
 ; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_binop(ptr addrspace(1) %out, float %a) #0 {
   %dbl = fadd float %a, %a
-  store float %dbl, float addrspace(1)* %out, align 4
+  store float %dbl, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -21,9 +21,9 @@ define amdgpu_kernel void @test_sgpr_use_twice_binop(float addrspace(1)* %out, f
 ; GCN: s_load_dword [[SGPR:s[0-9]+]],
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_three_ternary_op(ptr addrspace(1) %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1
-  store float %fma, float addrspace(1)* %out, align 4
+  store float %fma, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -33,9 +33,9 @@ define amdgpu_kernel void @test_sgpr_use_three_ternary_op(float addrspace(1)* %o
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[#LOAD + 3]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[#LOAD + 2]], s[[#LOAD + 2]], [[VGPR1]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(ptr addrspace(1) %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1
-  store float %fma, float addrspace(1)* %out, align 4
+  store float %fma, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -62,13 +62,13 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(
 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[#LOAD + 2]], [[VA1]], [[VB]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
-define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
-  %va0 = load volatile float, float addrspace(1)* %in
-  %va1 = load volatile float, float addrspace(1)* %in
+define amdgpu_kernel void @test_use_s_v_s(ptr addrspace(1) %out, float %a, float %b, ptr addrspace(1) %in) #0 {
+  %va0 = load volatile float, ptr addrspace(1) %in
+  %va1 = load volatile float, ptr addrspace(1) %in
   %fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1
   %fma1 = call float @llvm.fma.f32(float %a, float %va1, float %b) #1
-  store volatile float %fma0, float addrspace(1)* %out
-  store volatile float %fma1, float addrspace(1)* %out
+  store volatile float %fma0, ptr addrspace(1) %out
+  store volatile float %fma1, ptr addrspace(1) %out
   ret void
 }
 
@@ -78,9 +78,9 @@ define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, fl
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[#LOAD + 3]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[#LOAD + 2]], [[VGPR1]], s[[#LOAD + 2]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(ptr addrspace(1) %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
-  store float %fma, float addrspace(1)* %out, align 4
+  store float %fma, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -90,9 +90,9 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[#LOAD + 3]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], s[[#LOAD + 2]], s[[#LOAD + 2]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(ptr addrspace(1) %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
-  store float %fma, float addrspace(1)* %out, align 4
+  store float %fma, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -100,9 +100,9 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_imm(ptr addrspace(1) %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1
-  store float %fma, float addrspace(1)* %out, align 4
+  store float %fma, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -110,9 +110,9 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspac
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_imm_a(ptr addrspace(1) %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
-  store float %fma, float addrspace(1)* %out, align 4
+  store float %fma, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -121,9 +121,9 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspac
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_imm_a_a(ptr addrspace(1) %out, float %a) #0 {
   %val = call float @llvm.amdgcn.div.fixup.f32(float 2.0, float %a, float %a) #1
-  store float %val, float addrspace(1)* %out, align 4
+  store float %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -132,9 +132,9 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspac
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[VK]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_kimm(ptr addrspace(1) %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float 1024.0) #1
-  store float %fma, float addrspace(1)* %out, align 4
+  store float %fma, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -144,9 +144,9 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspa
 ; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[SK]], [[SK]], [[VGPR]]
 ; GCN: buffer_store_dword [[RESULT0]]
-define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(ptr addrspace(1) %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
-  store float %fma, float addrspace(1)* %out
+  store float %fma, ptr addrspace(1) %out
   ret void
 }
 
@@ -160,11 +160,11 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspa
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s_x2(ptr addrspace(1) %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
   %fma1 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %b) #1
-  store volatile float %fma0, float addrspace(1)* %out
-  store volatile float %fma1, float addrspace(1)* %out
+  store volatile float %fma0, ptr addrspace(1) %out
+  store volatile float %fma1, ptr addrspace(1) %out
   ret void
 }
 
@@ -174,9 +174,9 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s_x2(float addr
 ; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR]], [[SK]], [[SK]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(ptr addrspace(1) %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
-  store float %fma, float addrspace(1)* %out
+  store float %fma, ptr addrspace(1) %out
   ret void
 }
 
@@ -190,11 +190,11 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspa
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k_x2(ptr addrspace(1) %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
   %fma1 = call float @llvm.fma.f32(float 1024.0, float %b, float 1024.0) #1
-  store volatile float %fma0, float addrspace(1)* %out
-  store volatile float %fma1, float addrspace(1)* %out
+  store volatile float %fma0, ptr addrspace(1) %out
+  store volatile float %fma1, ptr addrspace(1) %out
   ret void
 }
 
@@ -204,9 +204,9 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k_x2(float addr
 ; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR]], [[SK]], [[SK]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(ptr addrspace(1) %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
-  store float %fma, float addrspace(1)* %out
+  store float %fma, ptr addrspace(1) %out
   ret void
 }
 
@@ -220,11 +220,11 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspa
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(ptr addrspace(1) %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
   %fma1 = call float @llvm.fma.f32(float %b, float 1024.0, float 1024.0) #1
-  store volatile float %fma0, float addrspace(1)* %out
-  store volatile float %fma1, float addrspace(1)* %out
+  store volatile float %fma0, ptr addrspace(1) %out
+  store volatile float %fma1, ptr addrspace(1) %out
   ret void
 }
 
@@ -240,11 +240,11 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addr
 
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
-define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_s0_s1_k_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float %a, float %b, float 1024.0) #1
   %fma1 = call float @llvm.fma.f32(float %a, float %b, float 4096.0) #1
-  store volatile float %fma0, float addrspace(1)* %out
-  store volatile float %fma1, float addrspace(1)* %out
+  store volatile float %fma0, ptr addrspace(1) %out
+  store volatile float %fma1, ptr addrspace(1) %out
   ret void
 }
 
@@ -265,11 +265,11 @@ define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a,
 
 ; GCN: buffer_store_dwordx2 [[RESULT0]]
 ; GCN: buffer_store_dwordx2 [[RESULT1]]
-define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) #0 {
+define amdgpu_kernel void @test_s0_s1_k_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) #0 {
   %fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1
   %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1
-  store volatile double %fma0, double addrspace(1)* %out
-  store volatile double %fma1, double addrspace(1)* %out
+  store volatile double %fma0, ptr addrspace(1) %out
+  store volatile double %fma1, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
index 93a39f119f023..168de4af1e1a3 100644
--- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
@@ -17,12 +17,12 @@ bb:
   %v3 = zext i32 %v1 to i64
   %v.t = and i64 %v3, 255
   %v4 = add i64 %v2, %v.t
-  store i64 %v4, i64 addrspace(1) * undef
+  store i64 %v4, ptr addrspace(1) undef
   ret void
 }
 
 
-define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
+define amdgpu_kernel void @test_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
 ; GFX9-LABEL: test_add_co_sdwa:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -38,14 +38,14 @@ define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspac
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
-  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i32 %tmp
+  %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
   %tmp5 = and i32 %tmp4, 255
   %tmp6 = zext i32 %tmp5 to i64
-  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
-  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp7 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
+  %tmp8 = load i64, ptr addrspace(1) %tmp7, align 8
   %tmp9 = add nsw i64 %tmp8, %tmp6
-  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  store i64 %tmp9, ptr addrspace(1) %tmp7, align 8
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/v_illegal-atomics.ll b/llvm/test/CodeGen/AMDGPU/v_illegal-atomics.ll
index dd0d912932ce8..c8ef465f5d54f 100644
--- a/llvm/test/CodeGen/AMDGPU/v_illegal-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_illegal-atomics.ll
@@ -56,8 +56,8 @@
 ; GFX1100-OBJ-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1100-OBJ-NEXT: global_atomic_add_f32 v[0:1], v2, off
 
-define fastcc void @fadd_test(float addrspace(1)* nocapture noundef %0, float noundef %1) unnamed_addr {
-  %3 = tail call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* noundef %0, float noundef %1)
+define fastcc void @fadd_test(ptr addrspace(1) nocapture noundef %0, float noundef %1) unnamed_addr {
+  %3 = tail call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) noundef %0, float noundef %1)
   ret void
 }
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float)
+declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float)

diff  --git a/llvm/test/CodeGen/AMDGPU/v_illegal-image_sample.ll b/llvm/test/CodeGen/AMDGPU/v_illegal-image_sample.ll
index 8a23d4ed1c958..a72cd2bd4f00a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_illegal-image_sample.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_illegal-image_sample.ll
@@ -29,11 +29,11 @@
 ; GFX1100-NOT: v_illegal
 ; GFX1100: image_sample_lz
 
-define amdgpu_kernel void @image_sample_test(<4 x float> addrspace(1)* %out, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) {
+define amdgpu_kernel void @image_sample_test(ptr addrspace(1) %out, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4) {
   
   %result = tail call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float %arg1, float %arg2, <8 x i32> %arg3, <4 x i32> %arg4, i1 false, i32 0, i32 0)
 
-  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  store <4 x float> %result, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 1fbf9593aceea..5ec9284c870c1 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -4,7 +4,7 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
-define amdgpu_kernel void @v_pack_b32_v2f16(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GCN-LABEL: v_pack_b32_v2f16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -40,10 +40,10 @@ define amdgpu_kernel void @v_pack_b32_v2f16(half addrspace(1)* %in0, half addrsp
 ; GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
-  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
-  %v0 = load volatile half, half addrspace(1)* %in0.gep
-  %v1 = load volatile half, half addrspace(1)* %in1.gep
+  %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds half, ptr addrspace(1) %in1, i64 %tid.ext
+  %v0 = load volatile half, ptr addrspace(1) %in0.gep
+  %v1 = load volatile half, ptr addrspace(1) %in1.gep
   %v0.add = fadd half %v0, 2.0
   %v1.add = fadd half %v1, 2.0
   %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
@@ -53,7 +53,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16(half addrspace(1)* %in0, half addrsp
   ret void
 }
 
-define amdgpu_kernel void @v_pack_b32_v2f16_sub(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GCN-LABEL: v_pack_b32_v2f16_sub:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -89,10 +89,10 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(half addrspace(1)* %in0, half ad
 ; GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
-  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
-  %v0 = load volatile half, half addrspace(1)* %in0.gep
-  %v1 = load volatile half, half addrspace(1)* %in1.gep
+  %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds half, ptr addrspace(1) %in1, i64 %tid.ext
+  %v0 = load volatile half, ptr addrspace(1) %in0.gep
+  %v1 = load volatile half, ptr addrspace(1) %in1.gep
   %v0.add = fsub half %v0, 2.0
   %v1.add = fadd half %v1, 2.0
   %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
@@ -135,15 +135,15 @@ define amdgpu_kernel void @fptrunc(
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GISEL-NEXT:    s_endpgm
-    <2 x half> addrspace(1)* %r,
-    <2 x float> addrspace(1)* %a) {
-  %a.val = load <2 x float>, <2 x float> addrspace(1)* %a
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a) {
+  %a.val = load <2 x float>, ptr addrspace(1) %a
   %r.val = fptrunc <2 x float> %a.val to <2 x half>
-  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  store <2 x half> %r.val, ptr addrspace(1) %r
   ret void
 }
 
-define amdgpu_kernel void @v_pack_b32.fabs(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GCN-LABEL: v_pack_b32.fabs:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -179,10 +179,10 @@ define amdgpu_kernel void @v_pack_b32.fabs(half addrspace(1)* %in0, half addrspa
 ; GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
-  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
-  %v0 = load volatile half, half addrspace(1)* %in0.gep
-  %v1 = load volatile half, half addrspace(1)* %in1.gep
+  %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds half, ptr addrspace(1) %in1, i64 %tid.ext
+  %v0 = load volatile half, ptr addrspace(1) %in0.gep
+  %v1 = load volatile half, ptr addrspace(1) %in1.gep
   %v0.add = fadd half %v0, 2.0
   %v1.add = fadd half %v1, 2.0
   %v0.fabs = call half @llvm.fabs.f16(half %v0.add)
@@ -194,7 +194,7 @@ define amdgpu_kernel void @v_pack_b32.fabs(half addrspace(1)* %in0, half addrspa
   ret void
 }
 
-define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
 ; GCN-LABEL: v_pack_b32.fneg:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -230,10 +230,10 @@ define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspa
 ; GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
-  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
-  %v0 = load volatile half, half addrspace(1)* %in0.gep
-  %v1 = load volatile half, half addrspace(1)* %in1.gep
+  %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds half, ptr addrspace(1) %in1, i64 %tid.ext
+  %v0 = load volatile half, ptr addrspace(1) %in0.gep
+  %v1 = load volatile half, ptr addrspace(1) %in1.gep
   %v0.add = fadd half %v0, 2.0
   %v1.add = fadd half %v1, 2.0
   %v0.fneg = fsub half -0.0, %v0.add

diff  --git a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
index 0227e1e22235f..6818b804ea513 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
@@ -17,12 +17,12 @@ bb:
   %v3 = zext i32 %v1 to i64
   %v.t = and i64 %v3, 255
   %v4 = sub i64 %v2, %v.t
-  store i64 %v4, i64 addrspace(1) * undef
+  store i64 %v4, ptr addrspace(1) undef
   ret void
 }
 
 
-define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 {
+define amdgpu_kernel void @test_sub_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
 ; GFX9-LABEL: test_sub_co_sdwa:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -38,14 +38,14 @@ define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspac
 ; GFX9-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp
-  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i32 %tmp
+  %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
   %tmp5 = and i32 %tmp4, 255
   %tmp6 = zext i32 %tmp5 to i64
-  %tmp7 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
-  %tmp8 = load i64, i64 addrspace(1)* %tmp7, align 8
+  %tmp7 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
+  %tmp8 = load i64, ptr addrspace(1) %tmp7, align 8
   %tmp9 = sub nsw i64 %tmp8, %tmp6
-  store i64 %tmp9, i64 addrspace(1)* %tmp7, align 8
+  store i64 %tmp9, ptr addrspace(1) %tmp7, align 8
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
index 7820a17e9b69d..a2532d3840f22 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
@@ -7,7 +7,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 ; No dynamic indexing required
-define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx) #1 {
 ; GCN-LABEL: extract_insert_same_dynelt_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -22,16 +22,16 @@ define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %o
 ; GCN-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
-  %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %id.ext
-  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in
+  %gep.in = getelementptr inbounds <4 x i32>, ptr addrspace(1) %in, i64 %id.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %id.ext
+  %vec = load <4 x i32>, ptr addrspace(1) %gep.in
   %insert = insertelement <4 x i32> %vec, i32 %val, i32 %idx
   %extract = extractelement <4 x i32> %insert, i32 %idx
-  store i32 %extract, i32 addrspace(1)* %gep.out
+  store i32 %extract, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
+define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
 ; GCN-LABEL: extract_insert_different_dynelt_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
@@ -75,16 +75,16 @@ define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1
 ; GCN-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
-  %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %id.ext
-  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in
+  %gep.in = getelementptr inbounds <4 x i32>, ptr addrspace(1) %in, i64 %id.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %id.ext
+  %vec = load <4 x i32>, ptr addrspace(1) %gep.in
   %insert = insertelement <4 x i32> %vec, i32 %val, i32 %idx0
   %extract = extractelement <4 x i32> %insert, i32 %idx1
-  store i32 %extract, i32 addrspace(1)* %gep.out
+  store i32 %extract, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_elt2_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idx) #1 {
 ; GCN-LABEL: extract_insert_same_elt2_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -99,16 +99,16 @@ define amdgpu_kernel void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out
 ; GCN-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
-  %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %id.ext
-  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in
+  %gep.in = getelementptr inbounds <4 x i32>, ptr addrspace(1) %in, i64 %id.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %id.ext
+  %vec = load <4 x i32>, ptr addrspace(1) %gep.in
   %insert = insertelement <4 x i32> %vec, i32 %val, i32 %idx
   %extract = extractelement <4 x i32> %insert, i32 %idx
-  store i32 %extract, i32 addrspace(1)* %gep.out
+  store i32 %extract, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in, float %val, i32 %idx) #1 {
 ; GCN-LABEL: extract_insert_same_dynelt_v4f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -128,12 +128,12 @@ define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(float addrspace(1)*
 ; GCN-NEXT:    s_endpgm
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
-  %gep.in = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %id.ext
-  %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %id.ext
-  %vec = load volatile <4 x float>, <4 x float> addrspace(1)* %gep.in
+  %gep.in = getelementptr inbounds <4 x float>, ptr addrspace(1) %in, i64 %id.ext
+  %gep.out = getelementptr inbounds float, ptr addrspace(1) %out, i64 %id.ext
+  %vec = load volatile <4 x float>, ptr addrspace(1) %gep.in
   %insert = insertelement <4 x float> %vec, float %val, i32 %idx
   %extract = extractelement <4 x float> %insert, i32 %idx
-  store float %extract, float addrspace(1)* %gep.out
+  store float %extract, ptr addrspace(1) %gep.out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-legalizer-divergence.ll b/llvm/test/CodeGen/AMDGPU/vector-legalizer-divergence.ll
index 111ac5793ce1e..09875809715e9 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-legalizer-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-legalizer-divergence.ll
@@ -5,11 +5,11 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
-define amdgpu_kernel void @spam(double addrspace(1)* noalias %arg) {
+define amdgpu_kernel void @spam(ptr addrspace(1) noalias %arg) {
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
-  %tmp2 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp1
-  %tmp3 = load double, double addrspace(1)* %tmp2, align 8
+  %tmp2 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load double, ptr addrspace(1) %tmp2, align 8
   %tmp4 = fadd double undef, 0.000000e+00
   %tmp5 = insertelement <2 x double> undef, double %tmp4, i64 0
   %tmp6 = insertelement <2 x double> %tmp5, double %tmp3, i64 1
@@ -17,14 +17,14 @@ define amdgpu_kernel void @spam(double addrspace(1)* noalias %arg) {
   %tmp8 = fadd <2 x double> zeroinitializer, undef
   %tmp9 = fadd <2 x double> %tmp7, zeroinitializer
   %tmp10 = extractelement <2 x double> %tmp8, i64 0
-  %tmp11 = getelementptr inbounds double, double addrspace(1)* %tmp2, i64 2
-  store double %tmp10, double addrspace(1)* %tmp11, align 8
-  %tmp12 = getelementptr inbounds double, double addrspace(1)* %tmp2, i64 3
-  store double undef, double addrspace(1)* %tmp12, align 8
+  %tmp11 = getelementptr inbounds double, ptr addrspace(1) %tmp2, i64 2
+  store double %tmp10, ptr addrspace(1) %tmp11, align 8
+  %tmp12 = getelementptr inbounds double, ptr addrspace(1) %tmp2, i64 3
+  store double undef, ptr addrspace(1) %tmp12, align 8
   %tmp13 = extractelement <2 x double> %tmp9, i64 0
-  %tmp14 = getelementptr inbounds double, double addrspace(1)* %tmp2, i64 6
-  store double %tmp13, double addrspace(1)* %tmp14, align 8
-  %tmp15 = getelementptr inbounds double, double addrspace(1)* %tmp2, i64 7
-  store double 0.000000e+00, double addrspace(1)* %tmp15, align 8
+  %tmp14 = getelementptr inbounds double, ptr addrspace(1) %tmp2, i64 6
+  store double %tmp13, ptr addrspace(1) %tmp14, align 8
+  %tmp15 = getelementptr inbounds double, ptr addrspace(1) %tmp2, i64 7
+  store double 0.000000e+00, ptr addrspace(1) %tmp15, align 8
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 1446c0c1b65ab..7d7c16c25f6eb 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
 
-define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_23uu:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -26,13 +26,13 @@ define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_234u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -64,13 +64,13 @@ define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_u1u3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -93,13 +93,13 @@ define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_u3u1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -125,13 +125,13 @@ define <4 x half> @shuffle_v4f16_u3u1(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_u3uu:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -154,13 +154,13 @@ define <4 x half> @shuffle_v4f16_u3uu(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_3u6u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -194,13 +194,13 @@ define <4 x half> @shuffle_v4f16_3u6u(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_3uu7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -234,13 +234,13 @@ define <4 x half> @shuffle_v4f16_3uu7(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_35u5:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -272,13 +272,13 @@ define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_357u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -314,13 +314,13 @@ define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_0101:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -346,13 +346,13 @@ define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_0123:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -375,13 +375,13 @@ define <4 x half> @shuffle_v4f16_0123(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_0145:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -413,13 +413,13 @@ define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_0167:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -451,13 +451,13 @@ define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_2301:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -483,13 +483,13 @@ define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_2323(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_2323:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -515,13 +515,13 @@ define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_2345:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -553,13 +553,13 @@ define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_2367:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -591,13 +591,13 @@ define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_4501:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -631,13 +631,13 @@ define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_4523:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -671,13 +671,13 @@ define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_4545(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_4545:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -703,13 +703,13 @@ define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_4567(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_4567:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -732,13 +732,13 @@ define <4 x half> @shuffle_v4f16_4567(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_6701:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -772,13 +772,13 @@ define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_6723:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -812,13 +812,13 @@ define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_6745:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -844,13 +844,13 @@ define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_6767:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -876,13 +876,13 @@ define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_2356:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -916,13 +916,13 @@ define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_5623:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -956,13 +956,13 @@ define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_3456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -997,13 +997,13 @@ define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_5634:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1038,13 +1038,13 @@ define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_5734:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1080,13 +1080,13 @@ define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
   ret <4 x half> %shuffle
 }
 
-define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
+define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4i16_2356:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1120,13 +1120,13 @@ define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> ad
 ; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
-  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
+  %val0 = load <4 x i16>, ptr addrspace(1) %arg0
+  %val1 = load <4 x i16>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
   ret <4 x i16> %shuffle
 }
 
-define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
+define <4 x i16> @shuffle_v4i16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4i16_0167:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1158,13 +1158,13 @@ define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> ad
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
-  %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
+  %val0 = load <4 x i16>, ptr addrspace(1) %arg0
+  %val1 = load <4 x i16>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
   ret <4 x i16> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_0000:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1195,13 +1195,13 @@ define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_1010:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1231,13 +1231,13 @@ define <4 x half> @shuffle_v4f16_1010(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_1100:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1268,13 +1268,13 @@ define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_6161:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1308,13 +1308,13 @@ define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_2333:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1341,13 +1341,13 @@ define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_6667:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1374,13 +1374,13 @@ define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v8f16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v8f16_0101:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1406,13 +1406,13 @@ define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
-  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
+  %val0 = load <8 x half>, ptr addrspace(1) %arg0
+  %val1 = load <8 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v8f16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v8f16_0123:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1435,13 +1435,13 @@ define <4 x half> @shuffle_v8f16_0123(<8 x half> addrspace(1)* %arg0, <8 x half>
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
-  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
+  %val0 = load <8 x half>, ptr addrspace(1) %arg0
+  %val1 = load <8 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v8f16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v8f16_4589:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1473,13 +1473,13 @@ define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half>
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
-  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
+  %val0 = load <8 x half>, ptr addrspace(1) %arg0
+  %val1 = load <8 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v8f16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1513,13 +1513,13 @@ define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
-  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
+  %val0 = load <8 x half>, ptr addrspace(1) %arg0
+  %val1 = load <8 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1553,13 +1553,13 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x
 ; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
-  %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
+  %val0 = load <8 x half>, ptr addrspace(1) %arg0
+  %val1 = load <8 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v3f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v3f16_0122:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1586,13 +1586,13 @@ define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
-  %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
+  %val0 = load <3 x half>, ptr addrspace(1) %arg0
+  %val1 = load <3 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
   ret <4 x half> %shuffle
 }
 
-define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v2f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v2f16_0122:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1618,13 +1618,13 @@ define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half>
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, v0, v0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
-  %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
+  %val0 = load <2 x half>, ptr addrspace(1) %arg0
+  %val1 = load <2 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
   ret <4 x half> %shuffle
 }
 
-define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
+define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v6f16_452367:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1674,13 +1674,13 @@ define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x hal
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
-  %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
+  %val0 = load <6 x half>, ptr addrspace(1) %arg0
+  %val1 = load <6 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
   ret <6 x half> %shuffle
 }
 
-define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C)  {
+define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C)  {
 ; GFX9-LABEL: fma_shuffle:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1740,12 +1740,12 @@ define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readon
 entry:
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp12 = zext i32 %tmp1 to i64
-  %arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12
-  %tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8
-  %arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12
-  %tmp15 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx1, align 8
-  %arrayidx2 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %C, i64 %tmp12
-  %tmp16 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx2, align 8
+  %arrayidx = getelementptr inbounds <4 x half>, ptr addrspace(1) %A, i64 %tmp12
+  %tmp14 = load <4 x half>, ptr addrspace(1) %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds <4 x half>, ptr addrspace(1) %B, i64 %tmp12
+  %tmp15 = load <4 x half>, ptr addrspace(1) %arrayidx1, align 8
+  %arrayidx2 = getelementptr inbounds <4 x half>, ptr addrspace(1) %C, i64 %tmp12
+  %tmp16 = load <4 x half>, ptr addrspace(1) %arrayidx2, align 8
   %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
   %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
   %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
@@ -1762,11 +1762,11 @@ entry:
   %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
   %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8
+  store <4 x half> %tmp32, ptr addrspace(1) %arrayidx2, align 8
   ret void
 }
 
-define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
+define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_0456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1803,13 +1803,13 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
-  %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
+  %val0 = load <4 x half>, ptr addrspace(1) %arg0
+  %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
   ret <4 x half> %shuffle
 }
 
-define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out)  {
+define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ptr addrspace(1) %out)  {
 ; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1850,9 +1850,9 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[2:3]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16
+  %ld8 = load <8 x i32>, ptr addrspace(4) %in, align 16
   %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8
+  store <4 x i32> %id, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -1860,7 +1860,7 @@ declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 attributes #0 = { nounwind readnone speculatable }
-define <2 x half> @low16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1)* %x1) {
+define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: low16bits:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1891,14 +1891,14 @@ define <2 x half> @low16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4
-  %1 = load <2 x half>, <2 x half> addrspace(1)* %x1, align 4
+  %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
+  %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
   %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 0, i32 undef>
   %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 2>
   ret <2 x half> %vy1.2.vec.insert
 }
 
-define <2 x half> @hi16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1)* %x1) {
+define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: hi16bits:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1929,14 +1929,14 @@ define <2 x half> @hi16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4
-  %1 = load <2 x half>, <2 x half> addrspace(1)* %x1, align 4
+  %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
+  %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
   %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 1, i32 undef>
   %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 3>
   ret <2 x half> %vy1.2.vec.insert
 }
 
-define <2 x half> @low16hi16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1)* %x1) {
+define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: low16hi16bits:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1967,14 +1967,14 @@ define <2 x half> @low16hi16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrsp
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4
-  %1 = load <2 x half>, <2 x half> addrspace(1)* %x1, align 4
+  %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
+  %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
   %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 0, i32 undef>
   %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 3>
   ret <2 x half> %vy1.2.vec.insert
 }
 
-define <2 x half> @hi16low16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1)* %x1) {
+define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: hi16low16bits:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2004,14 +2004,14 @@ define <2 x half> @hi16low16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrsp
 ; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4
-  %1 = load <2 x half>, <2 x half> addrspace(1)* %x1, align 4
+  %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
+  %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
   %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 1, i32 undef>
   %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 2>
   ret <2 x half> %vy1.2.vec.insert
 }
 
-define <2 x i16> @i16_low16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(1)* %x1) {
+define <2 x i16> @i16_low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: i16_low16bits:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2042,14 +2042,14 @@ define <2 x i16> @i16_low16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4
-  %1 = load <2 x i16>, <2 x i16> addrspace(1)* %x1, align 4
+  %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
+  %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
   %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 0, i32 undef>
   %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 2>
   ret <2 x i16> %vy1.2.vec.insert
 }
 
-define <2 x i16> @i16_low16hi16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(1)* %x1) {
+define <2 x i16> @i16_low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: i16_low16hi16bits:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2080,14 +2080,14 @@ define <2 x i16> @i16_low16hi16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrs
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4
-  %1 = load <2 x i16>, <2 x i16> addrspace(1)* %x1, align 4
+  %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
+  %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
   %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 0, i32 undef>
   %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 3>
   ret <2 x i16> %vy1.2.vec.insert
 }
 
-define <2 x i16> @i16_hi16low16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(1)* %x1) {
+define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: i16_hi16low16bits:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2117,14 +2117,14 @@ define <2 x i16> @i16_hi16low16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrs
 ; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4
-  %1 = load <2 x i16>, <2 x i16> addrspace(1)* %x1, align 4
+  %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
+  %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
   %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 1, i32 undef>
   %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 2>
   ret <2 x i16> %vy1.2.vec.insert
 }
 
-define <2 x i16> @i16_hi16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(1)* %x1) {
+define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: i16_hi16bits:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2155,14 +2155,14 @@ define <2 x i16> @i16_hi16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4
-  %1 = load <2 x i16>, <2 x i16> addrspace(1)* %x1, align 4
+  %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
+  %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
   %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 1, i32 undef>
   %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 3>
   ret <2 x i16> %vy1.2.vec.insert
 }
 
-define <2 x i16> @v2i16_hi16bits(<2 x i16> addrspace(1)* %x0) {
+define <2 x i16> @v2i16_hi16bits(ptr addrspace(1) %x0) {
 ; GFX9-LABEL: v2i16_hi16bits:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2189,14 +2189,14 @@ define <2 x i16> @v2i16_hi16bits(<2 x i16> addrspace(1)* %x0) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4
+  %load0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
   %insert1 = insertelement <2 x i16> undef, i16 0, i32 0
   %insert2 = insertelement <2 x i16> %insert1, i16 0, i32 1
   %vec.ret = shufflevector <2 x i16> %insert2, <2 x i16> %load0, <2 x i32> <i32 0, i32 3>
   ret <2 x i16> %vec.ret
 }
 
-define <2 x half> @v2half_hi16bits(<2 x half> addrspace(1)* %x0) {
+define <2 x half> @v2half_hi16bits(ptr addrspace(1) %x0) {
 ; GFX9-LABEL: v2half_hi16bits:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2223,7 +2223,7 @@ define <2 x half> @v2half_hi16bits(<2 x half> addrspace(1)* %x0) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4
+  %load0 = load <2 x half>, ptr addrspace(1) %x0, align 4
   %insert1 = insertelement <2 x half> undef, half 0.0, i32 0
   %insert2 = insertelement <2 x half> %insert1, half 0.0, i32 1
   %vec.ret = shufflevector <2 x half> %insert2, <2 x half> %load0, <2 x i32> <i32 0, i32 3>

diff  --git a/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll b/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll
index 381ff5b1b518a..591d7d74afb7a 100644
--- a/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll
@@ -12,69 +12,69 @@
 ; CHECK-DAG: ds_write2_b32
 ; CHECK-DAG: ds_write2_b32
 
-define amdgpu_kernel void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) {
+define amdgpu_kernel void @vectorize_global_local(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(3) nocapture %arg1) {
 bb:
-  %tmp = load i32, i32 addrspace(1)* %arg, align 4
-  store i32 %tmp, i32 addrspace(3)* %arg1, align 4
-  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  %tmp3 = load i32, i32 addrspace(1)* %tmp2, align 4
-  %tmp4 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 1
-  store i32 %tmp3, i32 addrspace(3)* %tmp4, align 4
-  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
-  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
-  %tmp7 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 2
-  store i32 %tmp6, i32 addrspace(3)* %tmp7, align 4
-  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
-  %tmp9 = load i32, i32 addrspace(1)* %tmp8, align 4
-  %tmp10 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 3
-  store i32 %tmp9, i32 addrspace(3)* %tmp10, align 4
-  %tmp11 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4
-  %tmp12 = load i32, i32 addrspace(1)* %tmp11, align 4
-  %tmp13 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 4
-  store i32 %tmp12, i32 addrspace(3)* %tmp13, align 4
-  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 5
-  %tmp15 = load i32, i32 addrspace(1)* %tmp14, align 4
-  %tmp16 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 5
-  store i32 %tmp15, i32 addrspace(3)* %tmp16, align 4
-  %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 6
-  %tmp18 = load i32, i32 addrspace(1)* %tmp17, align 4
-  %tmp19 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 6
-  store i32 %tmp18, i32 addrspace(3)* %tmp19, align 4
-  %tmp20 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 7
-  %tmp21 = load i32, i32 addrspace(1)* %tmp20, align 4
-  %tmp22 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 7
-  store i32 %tmp21, i32 addrspace(3)* %tmp22, align 4
-  %tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8
-  %tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4
-  %tmp25 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 8
-  store i32 %tmp24, i32 addrspace(3)* %tmp25, align 4
-  %tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 9
-  %tmp27 = load i32, i32 addrspace(1)* %tmp26, align 4
-  %tmp28 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 9
-  store i32 %tmp27, i32 addrspace(3)* %tmp28, align 4
-  %tmp29 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 10
-  %tmp30 = load i32, i32 addrspace(1)* %tmp29, align 4
-  %tmp31 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 10
-  store i32 %tmp30, i32 addrspace(3)* %tmp31, align 4
-  %tmp32 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 11
-  %tmp33 = load i32, i32 addrspace(1)* %tmp32, align 4
-  %tmp34 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 11
-  store i32 %tmp33, i32 addrspace(3)* %tmp34, align 4
-  %tmp35 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 12
-  %tmp36 = load i32, i32 addrspace(1)* %tmp35, align 4
-  %tmp37 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 12
-  store i32 %tmp36, i32 addrspace(3)* %tmp37, align 4
-  %tmp38 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 13
-  %tmp39 = load i32, i32 addrspace(1)* %tmp38, align 4
-  %tmp40 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 13
-  store i32 %tmp39, i32 addrspace(3)* %tmp40, align 4
-  %tmp41 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 14
-  %tmp42 = load i32, i32 addrspace(1)* %tmp41, align 4
-  %tmp43 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 14
-  store i32 %tmp42, i32 addrspace(3)* %tmp43, align 4
-  %tmp44 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 15
-  %tmp45 = load i32, i32 addrspace(1)* %tmp44, align 4
-  %tmp46 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 15
-  store i32 %tmp45, i32 addrspace(3)* %tmp46, align 4
+  %tmp = load i32, ptr addrspace(1) %arg, align 4
+  store i32 %tmp, ptr addrspace(3) %arg1, align 4
+  %tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  %tmp3 = load i32, ptr addrspace(1) %tmp2, align 4
+  %tmp4 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 1
+  store i32 %tmp3, ptr addrspace(3) %tmp4, align 4
+  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
+  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
+  %tmp7 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 2
+  store i32 %tmp6, ptr addrspace(3) %tmp7, align 4
+  %tmp8 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
+  %tmp9 = load i32, ptr addrspace(1) %tmp8, align 4
+  %tmp10 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 3
+  store i32 %tmp9, ptr addrspace(3) %tmp10, align 4
+  %tmp11 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 4
+  %tmp12 = load i32, ptr addrspace(1) %tmp11, align 4
+  %tmp13 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 4
+  store i32 %tmp12, ptr addrspace(3) %tmp13, align 4
+  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 5
+  %tmp15 = load i32, ptr addrspace(1) %tmp14, align 4
+  %tmp16 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 5
+  store i32 %tmp15, ptr addrspace(3) %tmp16, align 4
+  %tmp17 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 6
+  %tmp18 = load i32, ptr addrspace(1) %tmp17, align 4
+  %tmp19 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 6
+  store i32 %tmp18, ptr addrspace(3) %tmp19, align 4
+  %tmp20 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 7
+  %tmp21 = load i32, ptr addrspace(1) %tmp20, align 4
+  %tmp22 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 7
+  store i32 %tmp21, ptr addrspace(3) %tmp22, align 4
+  %tmp23 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 8
+  %tmp24 = load i32, ptr addrspace(1) %tmp23, align 4
+  %tmp25 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 8
+  store i32 %tmp24, ptr addrspace(3) %tmp25, align 4
+  %tmp26 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 9
+  %tmp27 = load i32, ptr addrspace(1) %tmp26, align 4
+  %tmp28 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 9
+  store i32 %tmp27, ptr addrspace(3) %tmp28, align 4
+  %tmp29 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 10
+  %tmp30 = load i32, ptr addrspace(1) %tmp29, align 4
+  %tmp31 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 10
+  store i32 %tmp30, ptr addrspace(3) %tmp31, align 4
+  %tmp32 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 11
+  %tmp33 = load i32, ptr addrspace(1) %tmp32, align 4
+  %tmp34 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 11
+  store i32 %tmp33, ptr addrspace(3) %tmp34, align 4
+  %tmp35 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 12
+  %tmp36 = load i32, ptr addrspace(1) %tmp35, align 4
+  %tmp37 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 12
+  store i32 %tmp36, ptr addrspace(3) %tmp37, align 4
+  %tmp38 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 13
+  %tmp39 = load i32, ptr addrspace(1) %tmp38, align 4
+  %tmp40 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 13
+  store i32 %tmp39, ptr addrspace(3) %tmp40, align 4
+  %tmp41 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 14
+  %tmp42 = load i32, ptr addrspace(1) %tmp41, align 4
+  %tmp43 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 14
+  store i32 %tmp42, ptr addrspace(3) %tmp43, align 4
+  %tmp44 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 15
+  %tmp45 = load i32, ptr addrspace(1) %tmp44, align 4
+  %tmp46 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 15
+  store i32 %tmp45, ptr addrspace(3) %tmp46, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll b/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
index 40b61ece2380f..5abaf06141d22 100644
--- a/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
@@ -7,18 +7,16 @@
 ; GCN: s_lshr_b32 [[ID_Y:s[0-9]+]], [[ID_XY]], 16
 ; GCN: s_add_i32 [[ID_SUM:s[0-9]+]], [[ID_Y]], [[ID_XY]]
 ; GCN: s_and_b32 s{{[0-9]+}}, [[ID_SUM]], 0xffff
-define protected amdgpu_kernel void @load_idx_idy(i32 addrspace(1)* %out) {
+define protected amdgpu_kernel void @load_idx_idy(ptr addrspace(1) %out) {
 entry:
-  %disp = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep_x = getelementptr i8, i8 addrspace(4)* %disp, i64 4
-  %gep_x.cast = bitcast i8 addrspace(4)* %gep_x to i16 addrspace(4)*
-  %id_x = load i16, i16 addrspace(4)* %gep_x.cast, align 4, !invariant.load !0 ; load workgroup size x
-  %gep_y = getelementptr i8, i8 addrspace(4)* %disp, i64 6
-  %gep_y.cast = bitcast i8 addrspace(4)* %gep_y to i16 addrspace(4)*
-  %id_y = load i16, i16 addrspace(4)* %gep_y.cast, align 2, !invariant.load !0 ; load workgroup size y
+  %disp = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep_x = getelementptr i8, ptr addrspace(4) %disp, i64 4
+  %id_x = load i16, ptr addrspace(4) %gep_x, align 4, !invariant.load !0 ; load workgroup size x
+  %gep_y = getelementptr i8, ptr addrspace(4) %disp, i64 6
+  %id_y = load i16, ptr addrspace(4) %gep_y, align 2, !invariant.load !0 ; load workgroup size y
   %add = add nuw nsw i16 %id_y, %id_x
   %conv = zext i16 %add to i32
-  store i32 %conv, i32 addrspace(1)* %out, align 4
+  store i32 %conv, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -30,29 +28,25 @@ entry:
 ; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[D0]], 16
 ; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[D1]], 16
 ; GCN: s_endpgm
-define protected amdgpu_kernel void @load_4i16(i32 addrspace(1)* %out) {
+define protected amdgpu_kernel void @load_4i16(ptr addrspace(1) %out) {
 entry:
-  %disp = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep_x = getelementptr i8, i8 addrspace(4)* %disp, i64 4
-  %gep_x.cast = bitcast i8 addrspace(4)* %gep_x to i16 addrspace(4)*
-  %id_x = load i16, i16 addrspace(4)* %gep_x.cast, align 4, !invariant.load !0 ; load workgroup size x
-  %gep_y = getelementptr i8, i8 addrspace(4)* %disp, i64 6
-  %gep_y.cast = bitcast i8 addrspace(4)* %gep_y to i16 addrspace(4)*
-  %id_y = load i16, i16 addrspace(4)* %gep_y.cast, align 2, !invariant.load !0 ; load workgroup size y
-  %gep_z = getelementptr i8, i8 addrspace(4)* %disp, i64 8
-  %gep_z.cast = bitcast i8 addrspace(4)* %gep_z to i16 addrspace(4)*
-  %id_z = load i16, i16 addrspace(4)* %gep_z.cast, align 4, !invariant.load !0 ; load workgroup size x
-  %gep_w = getelementptr i8, i8 addrspace(4)* %disp, i64 10
-  %gep_w.cast = bitcast i8 addrspace(4)* %gep_w to i16 addrspace(4)*
-  %id_w = load i16, i16 addrspace(4)* %gep_w.cast, align 2, !invariant.load !0 ; load workgroup size y
+  %disp = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep_x = getelementptr i8, ptr addrspace(4) %disp, i64 4
+  %id_x = load i16, ptr addrspace(4) %gep_x, align 4, !invariant.load !0 ; load workgroup size x
+  %gep_y = getelementptr i8, ptr addrspace(4) %disp, i64 6
+  %id_y = load i16, ptr addrspace(4) %gep_y, align 2, !invariant.load !0 ; load workgroup size y
+  %gep_z = getelementptr i8, ptr addrspace(4) %disp, i64 8
+  %id_z = load i16, ptr addrspace(4) %gep_z, align 4, !invariant.load !0 ; load workgroup size x
+  %gep_w = getelementptr i8, ptr addrspace(4) %disp, i64 10
+  %id_w = load i16, ptr addrspace(4) %gep_w, align 2, !invariant.load !0 ; load workgroup size y
   %add = add nuw nsw i16 %id_y, %id_x
   %add2 = add nuw nsw i16 %id_z, %id_w
   %add3 = add nuw nsw i16 %add, %add2
   %conv = zext i16 %add3 to i32
-  store i32 %conv, i32 addrspace(1)* %out, align 4
+  store i32 %conv, ptr addrspace(1) %out, align 4
   ret void
 }
 
-declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
 
 !0 = !{!0}

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
index 1cdfeffb96e7d..ad7170eae81f9 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GCN
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
 
-define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 {
+define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 {
 ; GCN-LABEL: vgpr_descriptor_waterfall_loop_idom_update:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -70,7 +70,7 @@ entry:
   br label %bb0
 
 bb0:
-  %desc = load <4 x i32>, <4 x i32>* %arg, align 8
+  %desc = load <4 x i32>, ptr %arg, align 8
   tail call void @llvm.amdgcn.raw.buffer.store.f32(float undef, <4 x i32> %desc, i32 undef, i32 0, i32 0)
   br label %bb0
 }

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
index 0eb614c11142f..c71dc06c68d8d 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -158,7 +158,7 @@ for.end:
 }
 
 ; a loop inside an if-else
-define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %extern_func, float(float)* %extern_func2) #0 {
+define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_func, ptr %extern_func2) #0 {
 ; SI-LABEL: loop:
 ; SI:       ; %bb.0: ; %main_body
 ; SI-NEXT:    v_mov_b32_e32 v6, v0
@@ -236,7 +236,7 @@ end:
 }
 
 ; a loop inside an if-else, but the variable is still in use after the if-else
-define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(float)* %extern_func, float(float)* %extern_func2) #0 {
+define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %extern_func, ptr %extern_func2) #0 {
 ; SI-LABEL: loop_with_use:
 ; SI:       ; %bb.0: ; %main_body
 ; SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index d294add6fee5b..b504910ac5265 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -62,7 +62,7 @@
 ; GCN: ScratchSize: 1536
 
 ; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset.
-define amdgpu_kernel void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
+define amdgpu_kernel void @spill_vgpr_compute(<4 x float> %arg6, ptr addrspace(1) %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
 bb:
   %tmp = add i32 %arg1, %arg2
   %tmp7 = extractelement <4 x float> %arg6, i32 0
@@ -208,138 +208,138 @@ bb12:                                             ; preds = %bb145, %bb
   br i1 %tmp143, label %bb144, label %bb145
 
 bb144:                                            ; preds = %bb12
-  store volatile float %arg3, float addrspace(1)* %arg
-  store volatile float %tmp91, float addrspace(1)* %arg
-  store volatile float %tmp90, float addrspace(1)* %arg
-  store volatile float %tmp89, float addrspace(1)* %arg
-  store volatile float %tmp87, float addrspace(1)* %arg
-  store volatile float %tmp86, float addrspace(1)* %arg
-  store volatile float %tmp85, float addrspace(1)* %arg
-  store volatile float %tmp83, float addrspace(1)* %arg
-  store volatile float %tmp82, float addrspace(1)* %arg
-  store volatile float %tmp81, float addrspace(1)* %arg
-  store volatile float %tmp79, float addrspace(1)* %arg
-  store volatile float %tmp78, float addrspace(1)* %arg
-  store volatile float %tmp77, float addrspace(1)* %arg
-  store volatile float %tmp75, float addrspace(1)* %arg
-  store volatile float %tmp74, float addrspace(1)* %arg
-  store volatile float %tmp73, float addrspace(1)* %arg
-  store volatile float %tmp71, float addrspace(1)* %arg
-  store volatile float %tmp70, float addrspace(1)* %arg
-  store volatile float %tmp69, float addrspace(1)* %arg
-  store volatile float %tmp67, float addrspace(1)* %arg
-  store volatile float %tmp66, float addrspace(1)* %arg
-  store volatile float %tmp65, float addrspace(1)* %arg
-  store volatile float %tmp63, float addrspace(1)* %arg
-  store volatile float %tmp62, float addrspace(1)* %arg
-  store volatile float %tmp61, float addrspace(1)* %arg
-  store volatile float %tmp59, float addrspace(1)* %arg
-  store volatile float %tmp58, float addrspace(1)* %arg
-  store volatile float %tmp57, float addrspace(1)* %arg
-  store volatile float %tmp55, float addrspace(1)* %arg
-  store volatile float %tmp54, float addrspace(1)* %arg
-  store volatile float %tmp53, float addrspace(1)* %arg
-  store volatile float %tmp51, float addrspace(1)* %arg
-  store volatile float %tmp50, float addrspace(1)* %arg
-  store volatile float %tmp49, float addrspace(1)* %arg
-  store volatile float %tmp47, float addrspace(1)* %arg
-  store volatile float %tmp46, float addrspace(1)* %arg
-  store volatile float %tmp45, float addrspace(1)* %arg
-  store volatile float %tmp43, float addrspace(1)* %arg
-  store volatile float %tmp42, float addrspace(1)* %arg
-  store volatile float %tmp41, float addrspace(1)* %arg
-  store volatile float %tmp39, float addrspace(1)* %arg
-  store volatile float %tmp38, float addrspace(1)* %arg
-  store volatile float %tmp37, float addrspace(1)* %arg
-  store volatile float %tmp35, float addrspace(1)* %arg
-  store volatile float %tmp34, float addrspace(1)* %arg
-  store volatile float %tmp33, float addrspace(1)* %arg
-  store volatile float %tmp31, float addrspace(1)* %arg
-  store volatile float %tmp30, float addrspace(1)* %arg
-  store volatile float %tmp29, float addrspace(1)* %arg
-  store volatile float %tmp27, float addrspace(1)* %arg
-  store volatile float %tmp26, float addrspace(1)* %arg
-  store volatile float %tmp25, float addrspace(1)* %arg
-  store volatile float %tmp23, float addrspace(1)* %arg
-  store volatile float %tmp22, float addrspace(1)* %arg
-  store volatile float %tmp21, float addrspace(1)* %arg
-  store volatile float %tmp19, float addrspace(1)* %arg
-  store volatile float %tmp18, float addrspace(1)* %arg
-  store volatile float %tmp17, float addrspace(1)* %arg
-  store volatile float %tmp15, float addrspace(1)* %arg
-  store volatile float %tmp14, float addrspace(1)* %arg
-  store volatile float %tmp13, float addrspace(1)* %arg
-  store volatile float %tmp16, float addrspace(1)* %arg
-  store volatile float %tmp20, float addrspace(1)* %arg
-  store volatile float %tmp24, float addrspace(1)* %arg
-  store volatile float %tmp28, float addrspace(1)* %arg
-  store volatile float %tmp32, float addrspace(1)* %arg
-  store volatile float %tmp36, float addrspace(1)* %arg
-  store volatile float %tmp40, float addrspace(1)* %arg
-  store volatile float %tmp44, float addrspace(1)* %arg
-  store volatile float %tmp48, float addrspace(1)* %arg
-  store volatile float %tmp52, float addrspace(1)* %arg
-  store volatile float %tmp56, float addrspace(1)* %arg
-  store volatile float %tmp60, float addrspace(1)* %arg
-  store volatile float %tmp64, float addrspace(1)* %arg
-  store volatile float %tmp68, float addrspace(1)* %arg
-  store volatile float %tmp72, float addrspace(1)* %arg
-  store volatile float %tmp76, float addrspace(1)* %arg
-  store volatile float %tmp80, float addrspace(1)* %arg
-  store volatile float %tmp84, float addrspace(1)* %arg
-  store volatile float %tmp88, float addrspace(1)* %arg
-  store volatile float %tmp92, float addrspace(1)* %arg
-  store volatile float %tmp93, float addrspace(1)* %arg
-  store volatile float %tmp94, float addrspace(1)* %arg
-  store volatile float %tmp96, float addrspace(1)* %arg
-  store volatile float %tmp97, float addrspace(1)* %arg
-  store volatile float %tmp98, float addrspace(1)* %arg
-  store volatile float %tmp99, float addrspace(1)* %arg
-  store volatile float %tmp100, float addrspace(1)* %arg
-  store volatile float %tmp101, float addrspace(1)* %arg
-  store volatile float %tmp102, float addrspace(1)* %arg
-  store volatile float %tmp103, float addrspace(1)* %arg
-  store volatile float %tmp104, float addrspace(1)* %arg
-  store volatile float %tmp105, float addrspace(1)* %arg
-  store volatile float %tmp106, float addrspace(1)* %arg
-  store volatile float %tmp107, float addrspace(1)* %arg
-  store volatile float %tmp108, float addrspace(1)* %arg
-  store volatile float %tmp109, float addrspace(1)* %arg
-  store volatile float %tmp110, float addrspace(1)* %arg
-  store volatile float %tmp111, float addrspace(1)* %arg
-  store volatile float %tmp112, float addrspace(1)* %arg
-  store volatile float %tmp113, float addrspace(1)* %arg
-  store volatile float %tmp114, float addrspace(1)* %arg
-  store volatile float %tmp115, float addrspace(1)* %arg
-  store volatile float %tmp116, float addrspace(1)* %arg
-  store volatile float %tmp117, float addrspace(1)* %arg
-  store volatile float %tmp118, float addrspace(1)* %arg
-  store volatile float %tmp119, float addrspace(1)* %arg
-  store volatile float %tmp120, float addrspace(1)* %arg
-  store volatile float %tmp121, float addrspace(1)* %arg
-  store volatile float %tmp122, float addrspace(1)* %arg
-  store volatile float %tmp123, float addrspace(1)* %arg
-  store volatile float %tmp124, float addrspace(1)* %arg
-  store volatile float %tmp125, float addrspace(1)* %arg
-  store volatile float %tmp126, float addrspace(1)* %arg
-  store volatile float %tmp127, float addrspace(1)* %arg
-  store volatile float %tmp128, float addrspace(1)* %arg
-  store volatile float %tmp129, float addrspace(1)* %arg
-  store volatile float %tmp130, float addrspace(1)* %arg
-  store volatile float %tmp131, float addrspace(1)* %arg
-  store volatile float %tmp132, float addrspace(1)* %arg
-  store volatile float %tmp133, float addrspace(1)* %arg
-  store volatile float %tmp134, float addrspace(1)* %arg
-  store volatile float %tmp135, float addrspace(1)* %arg
-  store volatile float %tmp136, float addrspace(1)* %arg
-  store volatile float %tmp137, float addrspace(1)* %arg
-  store volatile float %tmp138, float addrspace(1)* %arg
-  store volatile float %tmp139, float addrspace(1)* %arg
-  store volatile float %arg4, float addrspace(1)* %arg
-  store volatile float %tmp7, float addrspace(1)* %arg
-  store volatile float %tmp8, float addrspace(1)* %arg
-  store volatile float %tmp9, float addrspace(1)* %arg
-  store volatile float %tmp10, float addrspace(1)* %arg
+  store volatile float %arg3, ptr addrspace(1) %arg
+  store volatile float %tmp91, ptr addrspace(1) %arg
+  store volatile float %tmp90, ptr addrspace(1) %arg
+  store volatile float %tmp89, ptr addrspace(1) %arg
+  store volatile float %tmp87, ptr addrspace(1) %arg
+  store volatile float %tmp86, ptr addrspace(1) %arg
+  store volatile float %tmp85, ptr addrspace(1) %arg
+  store volatile float %tmp83, ptr addrspace(1) %arg
+  store volatile float %tmp82, ptr addrspace(1) %arg
+  store volatile float %tmp81, ptr addrspace(1) %arg
+  store volatile float %tmp79, ptr addrspace(1) %arg
+  store volatile float %tmp78, ptr addrspace(1) %arg
+  store volatile float %tmp77, ptr addrspace(1) %arg
+  store volatile float %tmp75, ptr addrspace(1) %arg
+  store volatile float %tmp74, ptr addrspace(1) %arg
+  store volatile float %tmp73, ptr addrspace(1) %arg
+  store volatile float %tmp71, ptr addrspace(1) %arg
+  store volatile float %tmp70, ptr addrspace(1) %arg
+  store volatile float %tmp69, ptr addrspace(1) %arg
+  store volatile float %tmp67, ptr addrspace(1) %arg
+  store volatile float %tmp66, ptr addrspace(1) %arg
+  store volatile float %tmp65, ptr addrspace(1) %arg
+  store volatile float %tmp63, ptr addrspace(1) %arg
+  store volatile float %tmp62, ptr addrspace(1) %arg
+  store volatile float %tmp61, ptr addrspace(1) %arg
+  store volatile float %tmp59, ptr addrspace(1) %arg
+  store volatile float %tmp58, ptr addrspace(1) %arg
+  store volatile float %tmp57, ptr addrspace(1) %arg
+  store volatile float %tmp55, ptr addrspace(1) %arg
+  store volatile float %tmp54, ptr addrspace(1) %arg
+  store volatile float %tmp53, ptr addrspace(1) %arg
+  store volatile float %tmp51, ptr addrspace(1) %arg
+  store volatile float %tmp50, ptr addrspace(1) %arg
+  store volatile float %tmp49, ptr addrspace(1) %arg
+  store volatile float %tmp47, ptr addrspace(1) %arg
+  store volatile float %tmp46, ptr addrspace(1) %arg
+  store volatile float %tmp45, ptr addrspace(1) %arg
+  store volatile float %tmp43, ptr addrspace(1) %arg
+  store volatile float %tmp42, ptr addrspace(1) %arg
+  store volatile float %tmp41, ptr addrspace(1) %arg
+  store volatile float %tmp39, ptr addrspace(1) %arg
+  store volatile float %tmp38, ptr addrspace(1) %arg
+  store volatile float %tmp37, ptr addrspace(1) %arg
+  store volatile float %tmp35, ptr addrspace(1) %arg
+  store volatile float %tmp34, ptr addrspace(1) %arg
+  store volatile float %tmp33, ptr addrspace(1) %arg
+  store volatile float %tmp31, ptr addrspace(1) %arg
+  store volatile float %tmp30, ptr addrspace(1) %arg
+  store volatile float %tmp29, ptr addrspace(1) %arg
+  store volatile float %tmp27, ptr addrspace(1) %arg
+  store volatile float %tmp26, ptr addrspace(1) %arg
+  store volatile float %tmp25, ptr addrspace(1) %arg
+  store volatile float %tmp23, ptr addrspace(1) %arg
+  store volatile float %tmp22, ptr addrspace(1) %arg
+  store volatile float %tmp21, ptr addrspace(1) %arg
+  store volatile float %tmp19, ptr addrspace(1) %arg
+  store volatile float %tmp18, ptr addrspace(1) %arg
+  store volatile float %tmp17, ptr addrspace(1) %arg
+  store volatile float %tmp15, ptr addrspace(1) %arg
+  store volatile float %tmp14, ptr addrspace(1) %arg
+  store volatile float %tmp13, ptr addrspace(1) %arg
+  store volatile float %tmp16, ptr addrspace(1) %arg
+  store volatile float %tmp20, ptr addrspace(1) %arg
+  store volatile float %tmp24, ptr addrspace(1) %arg
+  store volatile float %tmp28, ptr addrspace(1) %arg
+  store volatile float %tmp32, ptr addrspace(1) %arg
+  store volatile float %tmp36, ptr addrspace(1) %arg
+  store volatile float %tmp40, ptr addrspace(1) %arg
+  store volatile float %tmp44, ptr addrspace(1) %arg
+  store volatile float %tmp48, ptr addrspace(1) %arg
+  store volatile float %tmp52, ptr addrspace(1) %arg
+  store volatile float %tmp56, ptr addrspace(1) %arg
+  store volatile float %tmp60, ptr addrspace(1) %arg
+  store volatile float %tmp64, ptr addrspace(1) %arg
+  store volatile float %tmp68, ptr addrspace(1) %arg
+  store volatile float %tmp72, ptr addrspace(1) %arg
+  store volatile float %tmp76, ptr addrspace(1) %arg
+  store volatile float %tmp80, ptr addrspace(1) %arg
+  store volatile float %tmp84, ptr addrspace(1) %arg
+  store volatile float %tmp88, ptr addrspace(1) %arg
+  store volatile float %tmp92, ptr addrspace(1) %arg
+  store volatile float %tmp93, ptr addrspace(1) %arg
+  store volatile float %tmp94, ptr addrspace(1) %arg
+  store volatile float %tmp96, ptr addrspace(1) %arg
+  store volatile float %tmp97, ptr addrspace(1) %arg
+  store volatile float %tmp98, ptr addrspace(1) %arg
+  store volatile float %tmp99, ptr addrspace(1) %arg
+  store volatile float %tmp100, ptr addrspace(1) %arg
+  store volatile float %tmp101, ptr addrspace(1) %arg
+  store volatile float %tmp102, ptr addrspace(1) %arg
+  store volatile float %tmp103, ptr addrspace(1) %arg
+  store volatile float %tmp104, ptr addrspace(1) %arg
+  store volatile float %tmp105, ptr addrspace(1) %arg
+  store volatile float %tmp106, ptr addrspace(1) %arg
+  store volatile float %tmp107, ptr addrspace(1) %arg
+  store volatile float %tmp108, ptr addrspace(1) %arg
+  store volatile float %tmp109, ptr addrspace(1) %arg
+  store volatile float %tmp110, ptr addrspace(1) %arg
+  store volatile float %tmp111, ptr addrspace(1) %arg
+  store volatile float %tmp112, ptr addrspace(1) %arg
+  store volatile float %tmp113, ptr addrspace(1) %arg
+  store volatile float %tmp114, ptr addrspace(1) %arg
+  store volatile float %tmp115, ptr addrspace(1) %arg
+  store volatile float %tmp116, ptr addrspace(1) %arg
+  store volatile float %tmp117, ptr addrspace(1) %arg
+  store volatile float %tmp118, ptr addrspace(1) %arg
+  store volatile float %tmp119, ptr addrspace(1) %arg
+  store volatile float %tmp120, ptr addrspace(1) %arg
+  store volatile float %tmp121, ptr addrspace(1) %arg
+  store volatile float %tmp122, ptr addrspace(1) %arg
+  store volatile float %tmp123, ptr addrspace(1) %arg
+  store volatile float %tmp124, ptr addrspace(1) %arg
+  store volatile float %tmp125, ptr addrspace(1) %arg
+  store volatile float %tmp126, ptr addrspace(1) %arg
+  store volatile float %tmp127, ptr addrspace(1) %arg
+  store volatile float %tmp128, ptr addrspace(1) %arg
+  store volatile float %tmp129, ptr addrspace(1) %arg
+  store volatile float %tmp130, ptr addrspace(1) %arg
+  store volatile float %tmp131, ptr addrspace(1) %arg
+  store volatile float %tmp132, ptr addrspace(1) %arg
+  store volatile float %tmp133, ptr addrspace(1) %arg
+  store volatile float %tmp134, ptr addrspace(1) %arg
+  store volatile float %tmp135, ptr addrspace(1) %arg
+  store volatile float %tmp136, ptr addrspace(1) %arg
+  store volatile float %tmp137, ptr addrspace(1) %arg
+  store volatile float %tmp138, ptr addrspace(1) %arg
+  store volatile float %tmp139, ptr addrspace(1) %arg
+  store volatile float %arg4, ptr addrspace(1) %arg
+  store volatile float %tmp7, ptr addrspace(1) %arg
+  store volatile float %tmp8, ptr addrspace(1) %arg
+  store volatile float %tmp9, ptr addrspace(1) %arg
+  store volatile float %tmp10, ptr addrspace(1) %arg
   ret void
 
 bb145:                                            ; preds = %bb12

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 8b8a430863dd8..55cb06dd7cd6d 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -27,15 +27,13 @@
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 768
 
-define amdgpu_vs void @main([9 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [17 x <4 x i32>] addrspace(4)* inreg %arg2, [34 x <8 x i32>] addrspace(4)* inreg %arg3, [16 x <4 x i32>] addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
+define amdgpu_vs void @main(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, ptr addrspace(4) inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
 bb:
-  %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg1, i64 0, i64 0
-  %tmp11 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, align 16, !tbaa !0
+  %tmp11 = load <4 x i32>, ptr addrspace(4) %arg1, align 16, !tbaa !0
   %tmp12 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp11, i32 0, i32 0)
   %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp11, i32 16, i32 0)
   %tmp14 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp11, i32 32, i32 0)
-  %tmp15 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(4)* %arg4, i64 0, i64 0
-  %tmp16 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp15, align 16, !tbaa !0
+  %tmp16 = load <4 x i32>, ptr addrspace(4) %arg4, align 16, !tbaa !0
   %tmp17 = add i32 %arg5, %arg7
   %tmp16.cast = bitcast <4 x i32> %tmp16 to <4 x i32>
   %tmp18 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp16.cast, i32 %tmp17, i32 0, i32 0, i32 0)

diff --git a/llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
index d6b3f035a3e3b..de38208b6e500 100644
--- a/llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
@@ -1,12 +1,12 @@
 ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
-; ERROR: error: foo.cl:1:42: in function rsq_legacy_f32 void (float addrspace(1)*, float): intrinsic not supported on subtarget
+; ERROR: error: foo.cl:1:42: in function rsq_legacy_f32 void (ptr addrspace(1), float): intrinsic not supported on subtarget
 
 declare float @llvm.amdgcn.rsq.legacy(float) #0
 
-define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_legacy_f32(ptr addrspace(1) %out, float %src) #1 {
   %rsq = call float @llvm.amdgcn.rsq.legacy(float %src), !dbg !4
-  store float %rsq, float addrspace(1)* %out, align 4
+  store float %rsq, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll b/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll
index 6995cf6845553..3360019660673 100644
--- a/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll
@@ -11,7 +11,7 @@
 ; CHECK:    s_swappc_b64
 define amdgpu_kernel void @vgpr_multi_use_imm_fold() {
 entry:
-  store double 0.0, double addrspace(1)* undef, align 8
+  store double 0.0, ptr addrspace(1) undef, align 8
   %call0 = tail call fastcc double @__ocml_log_f64(double 2.0)
   %op = fadd double %call0, 0.0
   %call1 = tail call fastcc double @__ocml_sqrt_f64(double %op)

diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll
index 9b1b552b82a0f..a237e6d913653 100644
--- a/llvm/test/CodeGen/AMDGPU/vselect.ll
+++ b/llvm/test/CodeGen/AMDGPU/vselect.ll
@@ -17,13 +17,13 @@
 ; SI-DAG: s_cmp_gt_i32
 ; SI-DAG: s_cselect_b32
 
-define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
+define amdgpu_kernel void @test_select_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x i32> %val) {
 entry:
-  %load0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
-  %load1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
+  %load0 = load <2 x i32>, ptr addrspace(1) %in0
+  %load1 = load <2 x i32>, ptr addrspace(1) %in1
   %cmp = icmp sgt <2 x i32> %load0, %load1
   %result = select <2 x i1> %cmp, <2 x i32> %val, <2 x i32> %load0
-  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -37,13 +37,13 @@ entry:
 ; SI: v_cmp_neq_f32_e32 vcc
 ; SI: v_cndmask_b32_e32
 
-define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
+define amdgpu_kernel void @test_select_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
 entry:
-  %0 = load <2 x float>, <2 x float> addrspace(1)* %in0
-  %1 = load <2 x float>, <2 x float> addrspace(1)* %in1
+  %0 = load <2 x float>, ptr addrspace(1) %in0
+  %1 = load <2 x float>, ptr addrspace(1) %in1
   %cmp = fcmp une <2 x float> %0, %1
   %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1
-  store <2 x float> %result, <2 x float> addrspace(1)* %out
+  store <2 x float> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -64,13 +64,13 @@ entry:
 ; SI-DAG: s_cselect_b32
 ; SI-DAG: s_cselect_b32
 
-define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
+define amdgpu_kernel void @test_select_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <4 x i32> %val) {
 entry:
-  %load0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
-  %load1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
+  %load0 = load <4 x i32>, ptr addrspace(1) %in0
+  %load1 = load <4 x i32>, ptr addrspace(1) %in1
   %cmp = icmp sgt <4 x i32> %load0, %load1
   %result = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %load0
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -84,12 +84,12 @@ entry:
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
+define amdgpu_kernel void @test_select_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
 entry:
-  %0 = load <4 x float>, <4 x float> addrspace(1)* %in0
-  %1 = load <4 x float>, <4 x float> addrspace(1)* %in1
+  %0 = load <4 x float>, ptr addrspace(1) %in0
+  %1 = load <4 x float>, ptr addrspace(1) %in1
   %cmp = fcmp une <4 x float> %0, %1
   %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1
-  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  store <4 x float> %result, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/vselect64.ll b/llvm/test/CodeGen/AMDGPU/vselect64.ll
index 4a04355651619..e9d71ad632dfc 100644
--- a/llvm/test/CodeGen/AMDGPU/vselect64.ll
+++ b/llvm/test/CodeGen/AMDGPU/vselect64.ll
@@ -5,11 +5,11 @@
 ; Make sure the vectors aren't being stored on the stack.  We know they are
 ; being stored on the stack if the shader uses at least 10 registers.
 ; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
-define amdgpu_kernel void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
+define amdgpu_kernel void @test_select_v4i64(ptr addrspace(1) %out, <4 x i32> %c) {
 entry:
        %cmp = icmp ne  <4 x i32> %c, <i32 0, i32 0, i32 0, i32 0>
        %result = select <4 x i1> %cmp, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> <i64 4, i64 5, i64 6, i64 7>
-       store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+       store <4 x i64> %result, ptr addrspace(1) %out
        ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll b/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
index 4c5eb3d3aa5d8..2fdea7d6dbe3b 100644
--- a/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
@@ -10,18 +10,18 @@
 ; CHECK-NOT: ALU_POP_AFTER
 ; CHECK: TEX
 ; CHECK-NEXT: POP
-define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %cond) {
 entry:
   %0 = icmp eq i32 %cond, 0
   br i1 %0, label %endif, label %if
 
 if:
-  %1 = load i32, i32 addrspace(1)* %in
+  %1 = load i32, ptr addrspace(1) %in
   br label %endif
 
 endif:
   %x = phi i32 [ %1, %if], [ 0, %entry]
-  store i32 %x, i32 addrspace(1)* %out
+  store i32 %x, ptr addrspace(1) %out
   br label %done
 
 done:

diff --git a/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll b/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll
index c4b619bf168f2..cc6e02b5671de 100644
--- a/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll
+++ b/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll
@@ -9,10 +9,10 @@
 ; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0
 ; CHECK: Fetch clause
 ; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0
-define amdgpu_kernel void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
+define amdgpu_kernel void @test(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in0) {
 entry:
-  %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0
-  %1 = load i32, i32 addrspace(1)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %0 = load ptr addrspace(1), ptr addrspace(1) %in0
+  %1 = load i32, ptr addrspace(1) %0
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 14d3503cce6e5..2bf74d9bcdf1c 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s
 ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
 
-define amdgpu_kernel void @widen_i16_constant_load(i16 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
 ; SI-LABEL: widen_i16_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -32,14 +32,14 @@ define amdgpu_kernel void @widen_i16_constant_load(i16 addrspace(4)* %arg) {
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
-  %load = load i16, i16 addrspace(4)* %arg, align 4
+  %load = load i16, ptr addrspace(4) %arg, align 4
   %add = add i16 %load, 999
   %or = or i16 %add, 4
-  store i16 %or, i16 addrspace(1)* null
+  store i16 %or, ptr addrspace(1) null
   ret void
 }
 
-define amdgpu_kernel void @widen_i16_constant_load_zext_i32(i16 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %arg) {
 ; SI-LABEL: widen_i16_constant_load_zext_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -71,15 +71,15 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(i16 addrspace(4)* %a
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
-  %load = load i16, i16 addrspace(4)* %arg, align 4
+  %load = load i16, ptr addrspace(4) %arg, align 4
   %ext = zext i16 %load to i32
   %add = add i32 %ext, 999
   %or = or i32 %add, 4
-  store i32 %or, i32 addrspace(1)* null
+  store i32 %or, ptr addrspace(1) null
   ret void
 }
 
-define amdgpu_kernel void @widen_i16_constant_load_sext_i32(i16 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %arg) {
 ; SI-LABEL: widen_i16_constant_load_sext_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -111,15 +111,15 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(i16 addrspace(4)* %a
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
-  %load = load i16, i16 addrspace(4)* %arg, align 4
+  %load = load i16, ptr addrspace(4) %arg, align 4
   %ext = sext i16 %load to i32
   %add = add i32 %ext, 999
   %or = or i32 %add, 4
-  store i32 %or, i32 addrspace(1)* null
+  store i32 %or, ptr addrspace(1) null
   ret void
 }
 
-define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) {
 ; SI-LABEL: widen_i17_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -162,14 +162,14 @@ define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
-  %load = load i17, i17 addrspace(4)* %arg, align 4
+  %load = load i17, ptr addrspace(4) %arg, align 4
   %add = add i17 %load, 34
   %or = or i17 %add, 4
-  store i17 %or, i17 addrspace(1)* null
+  store i17 %or, ptr addrspace(1) null
   ret void
 }
 
-define amdgpu_kernel void @widen_f16_constant_load(half addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
 ; SI-LABEL: widen_f16_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -197,14 +197,14 @@ define amdgpu_kernel void @widen_f16_constant_load(half addrspace(4)* %arg) {
 ; VI-NEXT:    v_add_f16_e64 v2, s0, 4.0
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
-  %load = load half, half addrspace(4)* %arg, align 4
+  %load = load half, ptr addrspace(4) %arg, align 4
   %add = fadd half %load, 4.0
-  store half %add, half addrspace(1)* null
+  store half %add, ptr addrspace(1) null
   ret void
 }
 
 ; FIXME: valu usage on VI
-define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
 ; SI-LABEL: widen_v2i8_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -245,14 +245,14 @@ define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg)
 ; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
-  %load = load <2 x i8>, <2 x i8> addrspace(4)* %arg, align 4
+  %load = load <2 x i8>, ptr addrspace(4) %arg, align 4
   %add = add <2 x i8> %load, <i8 12, i8 44>
   %or = or <2 x i8> %add, <i8 4, i8 3>
-  store <2 x i8> %or, <2 x i8> addrspace(1)* null
+  store <2 x i8> %or, ptr addrspace(1) null
   ret void
 }
 
-define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)* %arg) {
+define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) %arg) {
 ; SI-LABEL: no_widen_i16_constant_divergent_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -290,15 +290,15 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = zext i32 %tid to i64
-  %gep.arg = getelementptr inbounds i16, i16 addrspace(4)* %arg, i64 %tid.ext
-  %load = load i16, i16 addrspace(4)* %gep.arg, align 4
+  %gep.arg = getelementptr inbounds i16, ptr addrspace(4) %arg, i64 %tid.ext
+  %load = load i16, ptr addrspace(4) %gep.arg, align 4
   %add = add i16 %load, 999
   %or = or i16 %add, 4
-  store i16 %or, i16 addrspace(1)* null
+  store i16 %or, ptr addrspace(1) null
   ret void
 }
 
-define amdgpu_kernel void @widen_i1_constant_load(i1 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) {
 ; SI-LABEL: widen_i1_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -326,13 +326,13 @@ define amdgpu_kernel void @widen_i1_constant_load(i1 addrspace(4)* %arg) {
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
-  %load = load i1, i1 addrspace(4)* %arg, align 4
+  %load = load i1, ptr addrspace(4) %arg, align 4
   %and = and i1 %load, true
-  store i1 %and, i1 addrspace(1)* null
+  store i1 %and, ptr addrspace(1) null
   ret void
 }
 
-define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(i16 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4) %arg) {
 ; SI-LABEL: widen_i16_zextload_i64_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -364,15 +364,15 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(i16 addrspace(4)
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
-  %load = load i16, i16 addrspace(4)* %arg, align 4
+  %load = load i16, ptr addrspace(4) %arg, align 4
   %zext = zext i16 %load to i32
   %add = add i32 %zext, 999
   %or = or i32 %add, 4
-  store i32 %or, i32 addrspace(1)* null
+  store i32 %or, ptr addrspace(1) null
   ret void
 }
 
-define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(i1 addrspace(4)* %arg) {
+define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %arg) {
 ; SI-LABEL: widen_i1_zext_to_i64_constant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -406,14 +406,14 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(i1 addrspace(4)* %
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
-  %load = load i1, i1 addrspace(4)* %arg, align 4
+  %load = load i1, ptr addrspace(4) %arg, align 4
   %zext = zext i1 %load to i64
   %add = add i64 %zext, 999
-  store i64 %add, i64 addrspace(1)* null
+  store i64 %add, ptr addrspace(1) null
   ret void
 }
 
-define amdgpu_kernel void @widen_i16_constant32_load(i16 addrspace(6)* %arg) {
+define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) {
 ; SI-LABEL: widen_i16_constant32_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
@@ -444,14 +444,14 @@ define amdgpu_kernel void @widen_i16_constant32_load(i16 addrspace(6)* %arg) {
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
-  %load = load i16, i16 addrspace(6)* %arg, align 4
+  %load = load i16, ptr addrspace(6) %arg, align 4
   %add = add i16 %load, 999
   %or = or i16 %add, 4
-  store i16 %or, i16 addrspace(1)* null
+  store i16 %or, ptr addrspace(1) null
   ret void
 }
 
-define amdgpu_kernel void @widen_i16_global_invariant_load(i16 addrspace(1)* %arg) {
+define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg) {
 ; SI-LABEL: widen_i16_global_invariant_load:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -481,10 +481,10 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(i16 addrspace(1)* %ar
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
-  %load = load i16, i16 addrspace(1)* %arg, align 4, !invariant.load !0
+  %load = load i16, ptr addrspace(1) %arg, align 4, !invariant.load !0
   %add = add i16 %load, 999
   %or = or i16 %add, 1
-  store i16 %or, i16 addrspace(1)* null
+  store i16 %or, ptr addrspace(1) null
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
index e2858fd259362..cac2bfde634c2 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
@@ -22,7 +22,7 @@ bb:
   %tmp8 = icmp sgt <4 x i64> %tmp6, <i64 -1, i64 -1, i64 -1, i64 -1>
   %tmp9 = and <4 x i1> %tmp8, %tmp7
   %tmp10 = select <4 x i1> %tmp9, <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, <4 x double> zeroinitializer
-  store <4 x double> %tmp10, <4 x double> addrspace(1)* null, align 32
+  store <4 x double> %tmp10, ptr addrspace(1) null, align 32
   ret void
 }
 
@@ -44,7 +44,7 @@ bb:
   %tmp8 = icmp sgt <4 x i64> %tmp6, <i64 -1, i64 -1, i64 -1, i64 -1>
   %tmp9 = and <4 x i1> %tmp8, %tmp7
   %tmp10 = select <4 x i1> %tmp9, <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i64> zeroinitializer
-  store <4 x i64> %tmp10, <4 x i64> addrspace(1)* null, align 32
+  store <4 x i64> %tmp10, ptr addrspace(1) null, align 32
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll b/llvm/test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
index f3532f8d0b852..24c1875159f67 100644
--- a/llvm/test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
@@ -1,333 +1,319 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare -amdgpu-codegenprepare-widen-constant-loads < %s | FileCheck -check-prefix=OPT %s
 
-declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
 
-define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i1(
-; OPT-NEXT:    [[VAL:%.*]] = load i1, i1 addrspace(4)* [[IN:%.*]], align 1
-; OPT-NEXT:    store i1 [[VAL]], i1 addrspace(1)* [[OUT:%.*]], align 1
+; OPT-NEXT:    [[VAL:%.*]] = load i1, ptr addrspace(4) [[IN:%.*]], align 1
+; OPT-NEXT:    store i1 [[VAL]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
-  %val = load i1, i1 addrspace(4)* %in
-  store i1 %val, i1 addrspace(1)* %out
+  %val = load i1, ptr addrspace(4) %in
+  store i1 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i1_align2(i1 addrspace(1)* %out, i1 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i1_align2(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i1_align2(
-; OPT-NEXT:    [[VAL:%.*]] = load i1, i1 addrspace(4)* [[IN:%.*]], align 2
-; OPT-NEXT:    store i1 [[VAL]], i1 addrspace(1)* [[OUT:%.*]], align 2
+; OPT-NEXT:    [[VAL:%.*]] = load i1, ptr addrspace(4) [[IN:%.*]], align 2
+; OPT-NEXT:    store i1 [[VAL]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; OPT-NEXT:    ret void
 ;
-  %val = load i1, i1 addrspace(4)* %in, align 2
-  store i1 %val, i1 addrspace(1)* %out, align 2
+  %val = load i1, ptr addrspace(4) %in, align 2
+  store i1 %val, ptr addrspace(1) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i1_align4(i1 addrspace(1)* %out, i1 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i1_align4(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i1_align4(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i1 addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i1
-; OPT-NEXT:    store i1 [[TMP3]], i1 addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store i1 [[TMP3]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %val = load i1, i1 addrspace(4)* %in, align 4
-  store i1 %val, i1 addrspace(1)* %out, align 4
+  %val = load i1, ptr addrspace(4) %in, align 4
+  store i1 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i8(
-; OPT-NEXT:    [[VAL:%.*]] = load i8, i8 addrspace(4)* [[IN:%.*]], align 1
-; OPT-NEXT:    store i8 [[VAL]], i8 addrspace(1)* [[OUT:%.*]], align 1
+; OPT-NEXT:    [[VAL:%.*]] = load i8, ptr addrspace(4) [[IN:%.*]], align 1
+; OPT-NEXT:    store i8 [[VAL]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; OPT-NEXT:    ret void
 ;
-  %val = load i8, i8 addrspace(4)* %in
-  store i8 %val, i8 addrspace(1)* %out
+  %val = load i8, ptr addrspace(4) %in
+  store i8 %val, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i8_align2(
-; OPT-NEXT:    [[VAL:%.*]] = load i8, i8 addrspace(4)* [[IN:%.*]], align 2
-; OPT-NEXT:    store i8 [[VAL]], i8 addrspace(1)* [[OUT:%.*]], align 2
+; OPT-NEXT:    [[VAL:%.*]] = load i8, ptr addrspace(4) [[IN:%.*]], align 2
+; OPT-NEXT:    store i8 [[VAL]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; OPT-NEXT:    ret void
 ;
-  %val = load i8, i8 addrspace(4)* %in, align 2
-  store i8 %val, i8 addrspace(1)* %out, align 2
+  %val = load i8, ptr addrspace(4) %in, align 2
+  store i8 %val, ptr addrspace(1) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i8align4(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i8align4(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i8align4(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
-; OPT-NEXT:    store i8 [[TMP3]], i8 addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store i8 [[TMP3]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %val = load i8, i8 addrspace(4)* %in, align 4
-  store i8 %val, i8 addrspace(1)* %out, align 4
+  %val = load i8, ptr addrspace(4) %in, align 4
+  store i8 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_v2i8(
-; OPT-NEXT:    [[LD:%.*]] = load <2 x i8>, <2 x i8> addrspace(4)* [[IN:%.*]], align 2
-; OPT-NEXT:    store <2 x i8> [[LD]], <2 x i8> addrspace(1)* [[OUT:%.*]], align 2
+; OPT-NEXT:    [[LD:%.*]] = load <2 x i8>, ptr addrspace(4) [[IN:%.*]], align 2
+; OPT-NEXT:    store <2 x i8> [[LD]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; OPT-NEXT:    ret void
 ;
-  %ld = load <2 x i8>, <2 x i8> addrspace(4)* %in
-  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  %ld = load <2 x i8>, ptr addrspace(4) %in
+  store <2 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v2i8_align4(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i8_align4(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_v2i8_align4(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast <2 x i8> addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
 ; OPT-NEXT:    [[TMP4:%.*]] = bitcast i16 [[TMP3]] to <2 x i8>
-; OPT-NEXT:    store <2 x i8> [[TMP4]], <2 x i8> addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store <2 x i8> [[TMP4]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load <2 x i8>, <2 x i8> addrspace(4)* %in, align 4
-  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out, align 4
+  %ld = load <2 x i8>, ptr addrspace(4) %in, align 4
+  store <2 x i8> %ld, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_v3i8(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast <3 x i8> addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i24
 ; OPT-NEXT:    [[TMP4:%.*]] = bitcast i24 [[TMP3]] to <3 x i8>
-; OPT-NEXT:    store <3 x i8> [[TMP4]], <3 x i8> addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store <3 x i8> [[TMP4]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in
-  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
+  %ld = load <3 x i8>, ptr addrspace(4) %in
+  store <3 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i8_align4(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_v3i8_align4(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast <3 x i8> addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i24
 ; OPT-NEXT:    [[TMP4:%.*]] = bitcast i24 [[TMP3]] to <3 x i8>
-; OPT-NEXT:    store <3 x i8> [[TMP4]], <3 x i8> addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store <3 x i8> [[TMP4]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in, align 4
-  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out, align 4
+  %ld = load <3 x i8>, ptr addrspace(4) %in, align 4
+  store <3 x i8> %ld, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i16(
-; OPT-NEXT:    [[LD:%.*]] = load i16, i16 addrspace(4)* [[IN:%.*]], align 2
+; OPT-NEXT:    [[LD:%.*]] = load i16, ptr addrspace(4) [[IN:%.*]], align 2
 ; OPT-NEXT:    [[EXT:%.*]] = sext i16 [[LD]] to i32
-; OPT-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load i16, i16 addrspace(4)* %in
+  %ld = load i16, ptr addrspace(4) %in
   %ext = sext i16 %ld to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i16_align4(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i16_align4(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
 ; OPT-NEXT:    [[EXT:%.*]] = sext i16 [[TMP3]] to i32
-; OPT-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load i16, i16 addrspace(4)* %in, align 4
+  %ld = load i16, ptr addrspace(4) %in, align 4
   %ext = sext i16 %ld to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @constant_load_f16(half addrspace(1)* %out, half addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_f16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_f16(
-; OPT-NEXT:    [[LD:%.*]] = load half, half addrspace(4)* [[IN:%.*]], align 2
-; OPT-NEXT:    store half [[LD]], half addrspace(1)* [[OUT:%.*]], align 2
+; OPT-NEXT:    [[LD:%.*]] = load half, ptr addrspace(4) [[IN:%.*]], align 2
+; OPT-NEXT:    store half [[LD]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; OPT-NEXT:    ret void
 ;
-  %ld = load half, half addrspace(4)* %in
-  store half %ld, half addrspace(1)* %out
+  %ld = load half, ptr addrspace(4) %in
+  store half %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_v2f16(
-; OPT-NEXT:    [[LD:%.*]] = load <2 x half>, <2 x half> addrspace(4)* [[IN:%.*]], align 4
-; OPT-NEXT:    store <2 x half> [[LD]], <2 x half> addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    [[LD:%.*]] = load <2 x half>, ptr addrspace(4) [[IN:%.*]], align 4
+; OPT-NEXT:    store <2 x half> [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load <2 x half>, <2 x half> addrspace(4)* %in
-  store <2 x half> %ld, <2 x half> addrspace(1)* %out
+  %ld = load <2 x half>, ptr addrspace(4) %in
+  store <2 x half> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @load_volatile(i16 addrspace(1)* %out, i16 addrspace(4)* %in) {
+define amdgpu_kernel void @load_volatile(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 ; OPT-LABEL: @load_volatile(
-; OPT-NEXT:    [[A:%.*]] = load volatile i16, i16 addrspace(4)* [[IN:%.*]], align 2
-; OPT-NEXT:    store i16 [[A]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; OPT-NEXT:    [[A:%.*]] = load volatile i16, ptr addrspace(4) [[IN:%.*]], align 2
+; OPT-NEXT:    store i16 [[A]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; OPT-NEXT:    ret void
 ;
-  %a = load volatile i16, i16 addrspace(4)* %in
-  store i16 %a, i16 addrspace(1)* %out
+  %a = load volatile i16, ptr addrspace(4) %in
+  store i16 %a, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v2i8_volatile(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i8_volatile(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_v2i8_volatile(
-; OPT-NEXT:    [[LD:%.*]] = load volatile <2 x i8>, <2 x i8> addrspace(4)* [[IN:%.*]], align 2
-; OPT-NEXT:    store <2 x i8> [[LD]], <2 x i8> addrspace(1)* [[OUT:%.*]], align 2
+; OPT-NEXT:    [[LD:%.*]] = load volatile <2 x i8>, ptr addrspace(4) [[IN:%.*]], align 2
+; OPT-NEXT:    store <2 x i8> [[LD]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; OPT-NEXT:    ret void
 ;
-  %ld = load volatile <2 x i8>, <2 x i8> addrspace(4)* %in
-  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  %ld = load volatile <2 x i8>, ptr addrspace(4) %in
+  store <2 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_v2i8_addrspace1(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i8_addrspace1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; OPT-LABEL: @constant_load_v2i8_addrspace1(
-; OPT-NEXT:    [[LD:%.*]] = load <2 x i8>, <2 x i8> addrspace(1)* [[IN:%.*]], align 2
-; OPT-NEXT:    store <2 x i8> [[LD]], <2 x i8> addrspace(1)* [[OUT:%.*]], align 2
+; OPT-NEXT:    [[LD:%.*]] = load <2 x i8>, ptr addrspace(1) [[IN:%.*]], align 2
+; OPT-NEXT:    store <2 x i8> [[LD]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; OPT-NEXT:    ret void
 ;
-  %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
-  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  %ld = load <2 x i8>, ptr addrspace(1) %in
+  store <2 x i8> %ld, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_dispatch_ptr(ptr addrspace(1) %ptr) #1 {
 ; OPT-LABEL: @use_dispatch_ptr(
-; OPT-NEXT:    [[DISPATCH_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[DISPATCH_PTR]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
+; OPT-NEXT:    [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
 ; OPT-NEXT:    [[LD:%.*]] = zext i8 [[TMP3]] to i32
-; OPT-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[PTR:%.*]], align 4
+; OPT-NEXT:    store i32 [[LD]], ptr addrspace(1) [[PTR:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %val = load i8, i8 addrspace(4)* %dispatch.ptr, align 4
+  %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %val = load i8, ptr addrspace(4) %dispatch.ptr, align 4
   %ld = zext i8 %val to i32
-  store i32 %ld, i32 addrspace(1)* %ptr
+  store i32 %ld, ptr addrspace(1) %ptr
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i16_align4_range(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i16_align4_range(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i16_align4_range(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4, !range [[RNG0:![0-9]+]]
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4, !range [[RNG0:![0-9]+]]
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
 ; OPT-NEXT:    [[EXT:%.*]] = sext i16 [[TMP3]] to i32
-; OPT-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load i16, i16 addrspace(4)* %in, align 4, !range !0
+  %ld = load i16, ptr addrspace(4) %in, align 4, !range !0
   %ext = sext i16 %ld to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i16_align4_range_max(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i16_align4_range_max(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i16_align4_range_max(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4, !range [[RNG0]]
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4, !range [[RNG0]]
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
 ; OPT-NEXT:    [[EXT:%.*]] = sext i16 [[TMP3]] to i32
-; OPT-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load i16, i16 addrspace(4)* %in, align 4, !range !1
+  %ld = load i16, ptr addrspace(4) %in, align 4, !range !1
   %ext = sext i16 %ld to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i16_align4_complex_range(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i16_align4_complex_range(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i16_align4_complex_range(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4, !range [[RNG1:![0-9]+]]
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4, !range [[RNG1:![0-9]+]]
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
 ; OPT-NEXT:    [[EXT:%.*]] = sext i16 [[TMP3]] to i32
-; OPT-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load i16, i16 addrspace(4)* %in, align 4, !range !2
+  %ld = load i16, ptr addrspace(4) %in, align 4, !range !2
   %ext = sext i16 %ld to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i16_align4_range_from_0(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i16_align4_range_from_0(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i16_align4_range_from_0(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
 ; OPT-NEXT:    [[EXT:%.*]] = sext i16 [[TMP3]] to i32
-; OPT-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load i16, i16 addrspace(4)* %in, align 4, !range !3
+  %ld = load i16, ptr addrspace(4) %in, align 4, !range !3
   %ext = sext i16 %ld to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i16_align4_range_from_neg(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i16_align4_range_from_neg(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i16_align4_range_from_neg(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4, !range [[RNG2:![0-9]+]]
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4, !range [[RNG2:![0-9]+]]
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
 ; OPT-NEXT:    [[EXT:%.*]] = sext i16 [[TMP3]] to i32
-; OPT-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load i16, i16 addrspace(4)* %in, align 4, !range !4
+  %ld = load i16, ptr addrspace(4) %in, align 4, !range !4
   %ext = sext i16 %ld to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i16_align4_range_from_neg_to_0(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i16_align4_range_from_neg_to_0(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i16_align4_range_from_neg_to_0(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4, !range [[RNG2]]
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4, !range [[RNG2]]
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
 ; OPT-NEXT:    [[EXT:%.*]] = sext i16 [[TMP3]] to i32
-; OPT-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load i16, i16 addrspace(4)* %in, align 4, !range !5
+  %ld = load i16, ptr addrspace(4) %in, align 4, !range !5
   %ext = sext i16 %ld to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @constant_load_i16_align4_invariant(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+define amdgpu_kernel void @constant_load_i16_align4_invariant(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 ; OPT-LABEL: @constant_load_i16_align4_invariant(
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(4)* [[IN:%.*]] to i32 addrspace(4)*
-; OPT-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4, !invariant.load !3
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN:%.*]], align 4, !invariant.load !3
 ; OPT-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
 ; OPT-NEXT:    [[EXT:%.*]] = sext i16 [[TMP3]] to i32
-; OPT-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; OPT-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; OPT-NEXT:    ret void
 ;
-  %ld = load i16, i16 addrspace(4)* %in, align 4, !invariant.load !6
+  %ld = load i16, ptr addrspace(4) %in, align 4, !invariant.load !6
   %ext = sext i16 %ld to i32
-  store i32 %ext, i32 addrspace(1)* %out
+  store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
index 7cd9fb9bcd4fc..c47c84b3b848b 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
@@ -11,11 +11,11 @@ define amdgpu_cs void @xyz () {
 .entry:
   br label %loop
 loop:
-  %ld = load <8 x float>, <8 x float> addrspace(5)* null, align 32
+  %ld = load <8 x float>, ptr addrspace(5) null, align 32
   %in_shuffle = shufflevector <8 x float> %ld, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %wmma = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> undef, <16 x half> undef, <4 x float> %in_shuffle)
   %out_shuffle = shufflevector <4 x float> %wmma, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-  store <8 x float> %out_shuffle, <8 x float> addrspace(5)* null, align 32
+  store <8 x float> %out_shuffle, ptr addrspace(5) null, align 32
   br i1 false, label %.exit, label %loop
 .exit:
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
index 4ef413dae002f..878527f6e715d 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
@@ -20,7 +20,7 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 im
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_f32_16x16x16_f16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
@@ -36,14 +36,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
   %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x float> %C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
-  store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
+  store <8 x float> %res, ptr addrspace(1) %out, align 32
+  store <8 x float> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
@@ -59,14 +59,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B,
 bb:
   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
   %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x float> %C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
-  store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
+  store <8 x float> %res, ptr addrspace(1) %out, align 32
+  store <8 x float> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
@@ -82,12 +82,12 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half>
 bb:
   %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
   %res2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <16 x half> %C, i1 0)
-  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
-  store <16 x half> %res2, <16 x half> addrspace(1)* %out2, align 32
+  store <16 x half> %res, ptr addrspace(1) %out, align 32
+  store <16 x half> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -103,14 +103,14 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half>
 bb:
   %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
   %res2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <16 x half> %C, i1 1)
-  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
-  store <16 x half> %res2, <16 x half> addrspace(1)* %out2, align 32
+  store <16 x half> %res, ptr addrspace(1) %out, align 32
+  store <16 x half> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
@@ -126,12 +126,12 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16>
 bb:
   %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
   %res2 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <16 x i16> %C, i1 0)
-  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
-  store <16 x i16> %res2, <16 x i16> addrspace(1)* %out2, align 32
+  store <16 x i16> %res, ptr addrspace(1) %out, align 32
+  store <16 x i16> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -147,14 +147,14 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16>
 bb:
   %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
   %res2 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <16 x i16> %C, i1 1)
-  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
-  store <16 x i16> %res2, <16 x i16> addrspace(1)* %out2, align 32
+  store <16 x i16> %res, ptr addrspace(1) %out, align 32
+  store <16 x i16> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15]
@@ -170,12 +170,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0]
@@ -191,12 +191,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0]
@@ -212,12 +212,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0]
@@ -233,12 +233,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] clamp
@@ -254,12 +254,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] clamp
@@ -275,12 +275,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] clamp
@@ -296,12 +296,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu8 v[20:27], v[0:3], v[4:7], v[8:15] neg_lo:[1,1,0] clamp
@@ -317,14 +317,14 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu4
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11]
@@ -340,12 +340,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
@@ -361,12 +361,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
@@ -382,12 +382,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0]
@@ -403,13 +403,13 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] clamp
@@ -425,12 +425,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] clamp
@@ -446,12 +446,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] clamp
@@ -467,12 +467,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W32-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_i32_16x16x16_iu4 v[16:23], v[0:1], v[2:3], v[4:11] neg_lo:[1,1,0] clamp
@@ -488,8 +488,8 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
 bb:
   %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
   %res2 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  store <8 x i32> %res, ptr addrspace(1) %out, align 32
+  store <8 x i32> %res2, ptr addrspace(1) %out2, align 32
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
index 3ce95a757cc71..9848d4aabc455 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
@@ -20,7 +20,7 @@ declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 im
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_f32_16x16x16_f16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
@@ -32,14 +32,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
   %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %B, <16 x half> %B, <4 x float> %C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
-  store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
+  store <4 x float> %res, ptr addrspace(1) %out, align 16
+  store <4 x float> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
@@ -51,14 +51,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B,
 bb:
   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
   %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <4 x float> %C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
-  store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
+  store <4 x float> %res, ptr addrspace(1) %out, align 16
+  store <4 x float> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
@@ -70,12 +70,12 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half>
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
   %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
-  store <8 x half> %res2, <8 x half> addrspace(1)* %out2, align 16
+  store <8 x half> %res, ptr addrspace(1) %out, align 16
+  store <8 x half> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -87,14 +87,14 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half>
 bb:
   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
   %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 1)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
-  store <8 x half> %res2, <8 x half> addrspace(1)* %out2, align 16
+  store <8 x half> %res, ptr addrspace(1) %out, align 16
+  store <8 x half> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
@@ -106,12 +106,12 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16>
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
   %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 0)
-  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
-  store <8 x i16> %res2, <8 x i16> addrspace(1)* %out2, align 16
+  store <8 x i16> %res, ptr addrspace(1) %out, align 16
+  store <8 x i16> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -123,14 +123,14 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16>
 bb:
   %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
   %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 1)
-  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
-  store <8 x i16> %res2, <8 x i16> addrspace(1)* %out2, align 16
+  store <8 x i16> %res, ptr addrspace(1) %out, align 16
+  store <8 x i16> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11]
@@ -142,13 +142,13 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
@@ -160,12 +160,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
@@ -177,12 +177,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
@@ -194,12 +194,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] clamp
@@ -211,12 +211,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
@@ -228,12 +228,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
@@ -245,12 +245,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui8_signed_signed_clamp:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu8 v[16:19], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
@@ -262,14 +262,14 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %B, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu4
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7]
@@ -281,12 +281,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
@@ -298,12 +298,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
@@ -315,12 +315,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
@@ -332,12 +332,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] clamp
@@ -349,12 +349,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
@@ -366,12 +366,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
@@ -383,12 +383,12 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
 ; W64-LABEL: test_wmma_i32_16x16x16_ui4_signed_signed_clamp:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_i32_16x16x16_iu4 v[12:15], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
@@ -400,8 +400,8 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
 bb:
   %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
   %res2 = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %B, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  store <4 x i32> %res, ptr addrspace(1) %out, align 16
+  store <4 x i32> %res2, ptr addrspace(1) %out2, align 16
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/xnor.ll b/llvm/test/CodeGen/AMDGPU/xnor.ll
index bf5cb3ece02cd..75a21bdc0ff3b 100644
--- a/llvm/test/CodeGen/AMDGPU/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xnor.ll
@@ -7,11 +7,11 @@
 ; GCN-LABEL: {{^}}scalar_xnor_i32_one_use
 ; GCN: s_xnor_b32
 define amdgpu_kernel void @scalar_xnor_i32_one_use(
-    i32 addrspace(1)* %r0, i32 %a, i32 %b) {
+    ptr addrspace(1) %r0, i32 %a, i32 %b) {
 entry:
   %xor = xor i32 %a, %b
   %r0.val = xor i32 %xor, -1
-  store i32 %r0.val, i32 addrspace(1)* %r0
+  store i32 %r0.val, ptr addrspace(1) %r0
   ret void
 }
 
@@ -21,24 +21,24 @@ entry:
 ; GCN: s_not_b32
 ; GCN: s_add_i32
 define amdgpu_kernel void @scalar_xnor_i32_mul_use(
-    i32 addrspace(1)* %r0, i32 addrspace(1)* %r1, i32 %a, i32 %b) {
+    ptr addrspace(1) %r0, ptr addrspace(1) %r1, i32 %a, i32 %b) {
 entry:
   %xor = xor i32 %a, %b
   %r0.val = xor i32 %xor, -1
   %r1.val = add i32 %xor, %a
-  store i32 %r0.val, i32 addrspace(1)* %r0
-  store i32 %r1.val, i32 addrspace(1)* %r1
+  store i32 %r0.val, ptr addrspace(1) %r0
+  store i32 %r1.val, ptr addrspace(1) %r1
   ret void
 }
 
 ; GCN-LABEL: {{^}}scalar_xnor_i64_one_use
 ; GCN: s_xnor_b64
 define amdgpu_kernel void @scalar_xnor_i64_one_use(
-    i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+    ptr addrspace(1) %r0, i64 %a, i64 %b) {
 entry:
   %xor = xor i64 %a, %b
   %r0.val = xor i64 %xor, -1
-  store i64 %r0.val, i64 addrspace(1)* %r0
+  store i64 %r0.val, ptr addrspace(1) %r0
   ret void
 }
 
@@ -49,13 +49,13 @@ entry:
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
 define amdgpu_kernel void @scalar_xnor_i64_mul_use(
-    i64 addrspace(1)* %r0, i64 addrspace(1)* %r1, i64 %a, i64 %b) {
+    ptr addrspace(1) %r0, ptr addrspace(1) %r1, i64 %a, i64 %b) {
 entry:
   %xor = xor i64 %a, %b
   %r0.val = xor i64 %xor, -1
   %r1.val = add i64 %xor, %a
-  store i64 %r0.val, i64 addrspace(1)* %r0
-  store i64 %r1.val, i64 addrspace(1)* %r1
+  store i64 %r0.val, ptr addrspace(1) %r0
+  store i64 %r1.val, ptr addrspace(1) %r1
   ret void
 }
 
@@ -90,11 +90,11 @@ entry:
 ; GCN-NOT: s_xnor_b32
 ; GCN: s_not_b32
 ; GCN: v_xor_b32
-define amdgpu_kernel void @xnor_s_v_i32_one_use(i32 addrspace(1)* %out, i32 %s) {
+define amdgpu_kernel void @xnor_s_v_i32_one_use(ptr addrspace(1) %out, i32 %s) {
   %v = call i32 @llvm.amdgcn.workitem.id.x() #1
   %xor = xor i32 %s, %v
   %d = xor i32 %xor, -1
-  store i32 %d, i32 addrspace(1)* %out
+  store i32 %d, ptr addrspace(1) %out
   ret void
 }
 
@@ -102,11 +102,11 @@ define amdgpu_kernel void @xnor_s_v_i32_one_use(i32 addrspace(1)* %out, i32 %s)
 ; GCN-NOT: s_xnor_b32
 ; GCN: s_not_b32
 ; GCN: v_xor_b32
-define amdgpu_kernel void @xnor_v_s_i32_one_use(i32 addrspace(1)* %out, i32 %s) {
+define amdgpu_kernel void @xnor_v_s_i32_one_use(ptr addrspace(1) %out, i32 %s) {
   %v = call i32 @llvm.amdgcn.workitem.id.x() #1
   %xor = xor i32 %v, %s
   %d = xor i32 %xor, -1
-  store i32 %d, i32 addrspace(1)* %out
+  store i32 %d, ptr addrspace(1) %out
   ret void
 }
 
@@ -118,14 +118,14 @@ define amdgpu_kernel void @xnor_v_s_i32_one_use(i32 addrspace(1)* %out, i32 %s)
 ; GCN-DL: v_xnor_b32
 ; GCN-DL: v_xnor_b32
 define amdgpu_kernel void @xnor_i64_s_v_one_use(
-  i64 addrspace(1)* %r0, i64 %a) {
+  ptr addrspace(1) %r0, i64 %a) {
 entry:
   %b32 = call i32 @llvm.amdgcn.workitem.id.x() #1
   %b64 = zext i32 %b32 to i64
   %b = shl i64 %b64, 29
   %xor = xor i64 %a, %b
   %r0.val = xor i64 %xor, -1
-  store i64 %r0.val, i64 addrspace(1)* %r0
+  store i64 %r0.val, ptr addrspace(1) %r0
   ret void
 }
 
@@ -137,14 +137,14 @@ entry:
 ; GCN-DL: v_xnor_b32
 ; GCN-DL: v_xnor_b32
 define amdgpu_kernel void @xnor_i64_v_s_one_use(
-  i64 addrspace(1)* %r0, i64 %a) {
+  ptr addrspace(1) %r0, i64 %a) {
 entry:
   %b32 = call i32 @llvm.amdgcn.workitem.id.x() #1
   %b64 = zext i32 %b32 to i64
   %b = shl i64 %b64, 29
   %xor = xor i64 %b, %a
   %r0.val = xor i64 %xor, -1
-  store i64 %r0.val, i64 addrspace(1)* %r0
+  store i64 %r0.val, ptr addrspace(1) %r0
   ret void
 }
 
@@ -175,22 +175,22 @@ entry:
 ; GCN-LABEL: {{^}}scalar_xor_a_nb_i64_one_use
 ; GCN: s_xnor_b64
 define amdgpu_kernel void @scalar_xor_a_nb_i64_one_use(
-    i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+    ptr addrspace(1) %r0, i64 %a, i64 %b) {
 entry:
   %nb = xor i64 %b, -1
   %r0.val = xor i64 %a, %nb
-  store i64 %r0.val, i64 addrspace(1)* %r0
+  store i64 %r0.val, ptr addrspace(1) %r0
   ret void
 }
 
 ; GCN-LABEL: {{^}}scalar_xor_na_b_i64_one_use
 ; GCN: s_xnor_b64
 define amdgpu_kernel void @scalar_xor_na_b_i64_one_use(
-    i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+    ptr addrspace(1) %r0, i64 %a, i64 %b) {
 entry:
   %na = xor i64 %a, -1
   %r0.val = xor i64 %na, %b
-  store i64 %r0.val, i64 addrspace(1)* %r0
+  store i64 %r0.val, ptr addrspace(1) %r0
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index a5e055290d37e..543b0f13a294d 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -10,11 +10,11 @@
 ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define amdgpu_kernel void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
-  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0
-  %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1
+define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load <2 x i32>, ptr addrspace(1) %in0
+  %b = load <2 x i32>, ptr addrspace(1) %in1
   %result = xor <2 x i32> %a, %b
-  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -29,11 +29,11 @@ define amdgpu_kernel void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
 ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
-define amdgpu_kernel void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
-  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0
-  %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1
+define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load <4 x i32>, ptr addrspace(1) %in0
+  %b = load <4 x i32>, ptr addrspace(1) %in1
   %result = xor <4 x i32> %a, %b
-  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -46,14 +46,14 @@ define amdgpu_kernel void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> add
 ; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
-  %a = load float, float addrspace(1) * %in0
-  %b = load float, float addrspace(1) * %in1
+define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load float, ptr addrspace(1) %in0
+  %b = load float, ptr addrspace(1) %in1
   %acmp = fcmp oge float %a, 0.000000e+00
   %bcmp = fcmp oge float %b, 1.000000e+00
   %xor = xor i1 %acmp, %bcmp
   %result = select i1 %xor, float %a, float %b
-  store float %result, float addrspace(1)* %out
+  store float %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -63,47 +63,47 @@ define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)*
 ; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
 ; SI: buffer_store_byte [[RESULT]]
-define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
-  %a = load volatile i1, i1 addrspace(1)* %in0
-  %b = load volatile i1, i1 addrspace(1)* %in1
+define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load volatile i1, ptr addrspace(1) %in0
+  %b = load volatile i1, ptr addrspace(1) %in1
   %xor = xor i1 %a, %b
-  store i1 %xor, i1 addrspace(1)* %out
+  store i1 %xor, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}vector_xor_i32:
 ; SI: v_xor_b32_e32
-define amdgpu_kernel void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
-  %a = load i32, i32 addrspace(1)* %in0
-  %b = load i32, i32 addrspace(1)* %in1
+define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load i32, ptr addrspace(1) %in0
+  %b = load i32, ptr addrspace(1) %in1
   %result = xor i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}scalar_xor_i32:
 ; SI: s_xor_b32
-define amdgpu_kernel void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
   %result = xor i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}scalar_not_i32:
 ; SI: s_not_b32
-define amdgpu_kernel void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
   %result = xor i32 %a, -1
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}vector_not_i32:
 ; SI: v_not_b32
-define amdgpu_kernel void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
-  %a = load i32, i32 addrspace(1)* %in0
-  %b = load i32, i32 addrspace(1)* %in1
+define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load i32, ptr addrspace(1) %in0
+  %b = load i32, ptr addrspace(1) %in1
   %result = xor i32 %a, -1
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -111,39 +111,39 @@ define amdgpu_kernel void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(
 ; SI: v_xor_b32_e32
 ; SI: v_xor_b32_e32
 ; SI: s_endpgm
-define amdgpu_kernel void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
-  %a = load i64, i64 addrspace(1)* %in0
-  %b = load i64, i64 addrspace(1)* %in1
+define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load i64, ptr addrspace(1) %in0
+  %b = load i64, ptr addrspace(1) %in1
   %result = xor i64 %a, %b
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}scalar_xor_i64:
 ; SI: s_xor_b64
 ; SI: s_endpgm
-define amdgpu_kernel void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
   %result = xor i64 %a, %b
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}scalar_not_i64:
 ; SI: s_not_b64
-define amdgpu_kernel void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
   %result = xor i64 %a, -1
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}vector_not_i64:
 ; SI: v_not_b32
 ; SI: v_not_b32
-define amdgpu_kernel void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
-  %a = load i64, i64 addrspace(1)* %in0
-  %b = load i64, i64 addrspace(1)* %in1
+define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load i64, ptr addrspace(1) %in0
+  %b = load i64, ptr addrspace(1) %in1
   %result = xor i64 %a, -1
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -153,7 +153,7 @@ define amdgpu_kernel void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(
 
 ; FUNC-LABEL: {{^}}xor_cf:
 ; SI: s_xor_b64
-define amdgpu_kernel void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
+define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
@@ -163,12 +163,12 @@ if:
   br label %endif
 
 else:
-  %2 = load i64, i64 addrspace(1)* %in
+  %2 = load i64, ptr addrspace(1) %in
   br label %endif
 
 endif:
   %3 = phi i64 [%1, %if], [%2, %else]
-  store i64 %3, i64 addrspace(1)* %out
+  store i64 %3, ptr addrspace(1) %out
   ret void
 }
 
@@ -178,9 +178,9 @@ endif:
 ; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s{{[0-9]+}}, 0x3039
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
-define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
+define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
   %or = xor i64 %a, 4261135838621753
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   ret void
 }
 
@@ -192,12 +192,12 @@ define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, [8 x i
 
 ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039
 ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xf237b
-define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) {
   %or = xor i64 %a, 4261135838621753
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
 
   %foo = add i64 %b, 4261135838621753
-  store volatile i64 %foo, i64 addrspace(1)* undef
+  store volatile i64 %foo, ptr addrspace(1) undef
   ret void
 }
 
@@ -211,18 +211,18 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %o
 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s{{[0-9]+}}
 ; SI-NOT: xor_b32
 ; SI: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
-define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
+define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
   %or = xor i64 %a, 63
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
 ; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -8
-define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
+define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
   %or = xor i64 %a, -8
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   ret void
 }
 
@@ -231,10 +231,10 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out,
 ; SI: v_xor_b32_e32 {{v[0-9]+}}, -8, v[[LO_VREG]]
 ; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}}
 ; SI: s_endpgm
-define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 8
+define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 8
   %or = xor i64 %loada, -8
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   ret void
 }
 
@@ -243,9 +243,9 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out,
 ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
 ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 8
+define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 8
   %or = xor i64 %loada, 22470723082367
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index a0e46e61c6c18..1f532f2706de7 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -9,29 +9,29 @@
 ; GCN: {{^}}s_mad_zext_i32_to_i64:
 ; GCN: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v[0:[[V_ZERO]]]
-define amdgpu_kernel void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @s_mad_zext_i32_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) #0 {
 entry:
   %tmp0 = mul i32 %a, %b
   %tmp1 = add i32 %tmp0, %c
   %tmp2 = zext i32 %tmp1 to i64
-  store i64 %tmp2, i64 addrspace(1)* %out
+  store i64 %tmp2, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i32
 ; GCN: v_cndmask_b32
-define amdgpu_kernel void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 entry:
   %tmp0 = icmp eq i32 %a, %b
   %tmp1 = zext i1 %tmp0 to i32
-  store i32 %tmp1, i32 addrspace(1)* %out
+  store i32 %tmp1, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_arg_zext_i1_to_i64:
-define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
+define amdgpu_kernel void @s_arg_zext_i1_to_i64(ptr addrspace(1) %out, i1 zeroext %arg) #0 {
   %ext = zext i1 %arg to i64
-  store i64 %ext, i64 addrspace(1)* %out, align 8
+  store i64 %ext, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -39,10 +39,10 @@ define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroe
 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0
 ; GCN-DAG: s_cmp_eq_u32
 ; GCN:     v_cndmask_b32
-define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
   %cmp = icmp eq i32 %a, %b
   %ext = zext i1 %cmp to i64
-  store i64 %ext, i64 addrspace(1)* %out, align 8
+  store i64 %ext, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -57,10 +57,10 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a,
 ; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN: buffer_store_short [[RESULT]]
-define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
   %tmp0 = icmp eq i16 %a, %b
   %tmp1 = zext i1 %tmp0 to i16
-  store i16 %tmp1, i16 addrspace(1)* %out
+  store i16 %tmp1, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
index 0296e308df660..efe93e4e4dafb 100644
--- a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
 
-define amdgpu_kernel void @zext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+define amdgpu_kernel void @zext_i16_to_i32_uniform(ptr addrspace(1) %out, i16 %a, i32 %b) {
 ; GCN-LABEL: zext_i16_to_i32_uniform:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -17,12 +17,12 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %
 ; GCN-NEXT:    s_endpgm
   %zext = zext i16 %a to i32
   %res = add i32 %b, %zext
-  store i32 %res, i32 addrspace(1)* %out
+  store i32 %res, ptr addrspace(1) %out
   ret void
 }
 
 
-define amdgpu_kernel void @zext_i16_to_i64_uniform(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+define amdgpu_kernel void @zext_i16_to_i64_uniform(ptr addrspace(1) %out, i16 %a, i64 %b) {
 ; GCN-LABEL: zext_i16_to_i64_uniform:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s6, s[0:1], 0xb
@@ -40,11 +40,11 @@ define amdgpu_kernel void @zext_i16_to_i64_uniform(i64 addrspace(1)* %out, i16 %
 ; GCN-NEXT:    s_endpgm
   %zext = zext i16 %a to i64
   %res = add i64 %b, %zext
-  store i64 %res, i64 addrspace(1)* %out
+  store i64 %res, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @zext_i16_to_i32_divergent(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+define amdgpu_kernel void @zext_i16_to_i32_divergent(ptr addrspace(1) %out, i16 %a, i32 %b) {
 ; GCN-LABEL: zext_i16_to_i32_divergent:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -60,12 +60,12 @@ define amdgpu_kernel void @zext_i16_to_i32_divergent(i32 addrspace(1)* %out, i16
   %tid.truncated = trunc i32 %tid to i16
   %divergent.a = add i16 %a, %tid.truncated
   %zext = zext i16 %divergent.a to i32
-  store i32 %zext, i32 addrspace(1)* %out
+  store i32 %zext, ptr addrspace(1) %out
   ret void
 }
 
 
-define amdgpu_kernel void @zext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+define amdgpu_kernel void @zext_i16_to_i64_divergent(ptr addrspace(1) %out, i16 %a, i64 %b) {
 ; GCN-LABEL: zext_i16_to_i64_divergent:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -82,7 +82,7 @@ define amdgpu_kernel void @zext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16
   %tid.truncated = trunc i32 %tid to i16
   %divergent.a = add i16 %a, %tid.truncated
   %zext = zext i16 %divergent.a to i64
-  store i64 %zext, i64 addrspace(1)* %out
+  store i64 %zext, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
index 78a367875ead6..184aa58a87182 100644
--- a/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
+++ b/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
@@ -11,12 +11,12 @@
 ; GCN-NOT: v[[HI]]
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
-  %ld.64 = load volatile i64, i64 addrspace(1)* %in0
-  %ld.32 = load volatile i32, i32 addrspace(1)* %in1
+define amdgpu_kernel void @zext_or_operand_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %ld.64 = load volatile i64, ptr addrspace(1) %in0
+  %ld.32 = load volatile i32, ptr addrspace(1) %in1
   %ext = zext i32 %ld.32 to i64
   %or = or i64 %ld.64, %ext
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   ret void
 }
 
@@ -31,11 +31,11 @@ define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrs
 ; GCN-NOT: _or_
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
-  %ld.64 = load volatile i64, i64 addrspace(1)* %in0
-  %ld.32 = load volatile i32, i32 addrspace(1)* %in1
+define amdgpu_kernel void @zext_or_operand_commute_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %ld.64 = load volatile i64, ptr addrspace(1) %in0
+  %ld.32 = load volatile i32, ptr addrspace(1) %in1
   %ext = zext i32 %ld.32 to i64
   %or = or i64 %ext, %ld.64
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/zext-lid.ll b/llvm/test/CodeGen/AMDGPU/zext-lid.ll
index e05b91c38a0cf..3550fe8f15367 100644
--- a/llvm/test/CodeGen/AMDGPU/zext-lid.ll
+++ b/llvm/test/CodeGen/AMDGPU/zext-lid.ll
@@ -3,37 +3,37 @@
 
 ; GCN-LABEL: {{^}}zext_grp_size_128:
 ; GCN-NOT: and_b32
-define amdgpu_kernel void @zext_grp_size_128(i32 addrspace(1)* nocapture %arg) #0 {
+define amdgpu_kernel void @zext_grp_size_128(ptr addrspace(1) nocapture %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = and i32 %tmp, 127
-  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  store i32 %tmp1, ptr addrspace(1) %arg, align 4
   %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y()
   %tmp3 = and i32 %tmp2, 127
-  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  store i32 %tmp3, ptr addrspace(1) %tmp4, align 4
   %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z()
   %tmp6 = and i32 %tmp5, 127
-  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
-  store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+  %tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
+  store i32 %tmp6, ptr addrspace(1) %tmp7, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}zext_grp_size_32x4x1:
 ; GCN-NOT: and_b32
-define amdgpu_kernel void @zext_grp_size_32x4x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !0 {
+define amdgpu_kernel void @zext_grp_size_32x4x1(ptr addrspace(1) nocapture %arg) #0 !reqd_work_group_size !0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = and i32 %tmp, 31
-  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  store i32 %tmp1, ptr addrspace(1) %arg, align 4
   %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y()
   %tmp3 = and i32 %tmp2, 3
-  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  store i32 %tmp3, ptr addrspace(1) %tmp4, align 4
   %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z()
   %tmp6 = and i32 %tmp5, 1
-  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
-  store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+  %tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
+  store i32 %tmp6, ptr addrspace(1) %tmp7, align 4
   ret void
 }
 
@@ -42,28 +42,28 @@ bb:
 
 ; When EarlyCSE is not run this call produces a range max with 0 active bits,
 ; which is a special case as an AssertZext from width 0 is invalid.
-define amdgpu_kernel void @zext_grp_size_1x1x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !1 {
+define amdgpu_kernel void @zext_grp_size_1x1x1(ptr addrspace(1) nocapture %arg) #0 !reqd_work_group_size !1 {
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = and i32 %tmp, 1
-  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  store i32 %tmp1, ptr addrspace(1) %arg, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}zext_grp_size_512:
 ; GCN-NOT: and_b32
-define amdgpu_kernel void @zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 {
+define amdgpu_kernel void @zext_grp_size_512(ptr addrspace(1) nocapture %arg) #1 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = and i32 %tmp, 65535
-  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  store i32 %tmp1, ptr addrspace(1) %arg, align 4
   %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y()
   %tmp3 = and i32 %tmp2, 65535
-  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+  %tmp4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  store i32 %tmp3, ptr addrspace(1) %tmp4, align 4
   %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z()
   %tmp6 = and i32 %tmp5, 65535
-  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
-  store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+  %tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
+  store i32 %tmp6, ptr addrspace(1) %tmp7, align 4
   ret void
 }
 
@@ -71,11 +71,11 @@ bb:
 ; O2-NOT: and_b32
 ; O2: v_and_b32_e32 v{{[0-9]+}}, 0x3ff,
 ; O2-NOT: and_b32
-define void @func_test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
+define void @func_test_workitem_id_x_known_max_range(ptr addrspace(1) nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %and = and i32 %id, 1023
-  store i32 %and, i32 addrspace(1)* %out, align 4
+  store i32 %and, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -83,11 +83,11 @@ entry:
 ; O2-NOT: and_b32
 ; O2: v_and_b32_e32 v{{[0-9]+}}, 0x3ff,
 ; O2-NOT: and_b32
-define void @func_test_workitem_id_x_default_range(i32 addrspace(1)* nocapture %out) #4 {
+define void @func_test_workitem_id_x_default_range(ptr addrspace(1) nocapture %out) #4 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %and = and i32 %id, 1023
-  store i32 %and, i32 addrspace(1)* %out, align 4
+  store i32 %and, ptr addrspace(1) %out, align 4
   ret void
 }
 


        

