[llvm] 4998de4 - AMDGPU: Update some wait tests to opaque pointers
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 1 18:14:03 PST 2022
Author: Matt Arsenault
Date: 2022-12-01T21:01:58-05:00
New Revision: 4998de4dcc83a0b0ad0b797716a15630ebed1755
URL: https://github.com/llvm/llvm-project/commit/4998de4dcc83a0b0ad0b797716a15630ebed1755
DIFF: https://github.com/llvm/llvm-project/commit/4998de4dcc83a0b0ad0b797716a15630ebed1755.diff
LOG: AMDGPU: Update some wait tests to opaque pointers
The script mangled the constantexprs in waitcnt-looptest.ll, so those were
fixed manually.
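For context, the conversion applied by the update script is mechanical: typed
pointer types become the address-space-qualified ptr type, and pointer-to-pointer
bitcasts fold away. A minimal sketch of the pattern (illustrative IR, not taken
from the committed tests):

  ; typed pointers
  %gep = getelementptr i32, i32 addrspace(1)* %p, i64 1
  %val = load i32, i32 addrspace(1)* %gep, align 4

  ; opaque pointers
  %gep = getelementptr i32, ptr addrspace(1) %p, i64 1
  %val = load i32, ptr addrspace(1) %gep, align 4

getelementptr and load keep their explicit value types, so the source element
type is still spelled out even though the pointer type no longer carries it.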
Added:
Modified:
llvm/test/CodeGen/AMDGPU/wait.ll
llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll
llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll
index 0d23e447b427e..41dd58db538d4 100644
--- a/llvm/test/CodeGen/AMDGPU/wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/wait.ll
@@ -13,19 +13,18 @@
; DEFAULT-DAG: exp
; DEFAULT: exp
; DEFAULT-NEXT: s_endpgm
-define amdgpu_vs void @main(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, <16 x i8> addrspace(4)* inreg %arg3, <16 x i8> addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(4)* inreg %constptr) #0 {
+define amdgpu_vs void @main(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, ptr addrspace(4) inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, ptr addrspace(4) inreg %constptr) #0 {
main_body:
- %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(4)* %arg3, i32 0
- %tmp10 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp, !tbaa !0
+ %tmp10 = load <16 x i8>, ptr addrspace(4) %arg3, !tbaa !0
%tmp10.cast = bitcast <16 x i8> %tmp10 to <4 x i32>
%tmp11 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp10.cast, i32 %arg6, i32 0, i32 0, i32 0)
%tmp12 = extractelement <4 x float> %tmp11, i32 0
%tmp13 = extractelement <4 x float> %tmp11, i32 1
call void @llvm.amdgcn.s.barrier() #1
%tmp14 = extractelement <4 x float> %tmp11, i32 2
- %tmp15 = load float, float addrspace(4)* %constptr, align 4
- %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(4)* %arg3, i32 1
- %tmp17 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp16, !tbaa !0
+ %tmp15 = load float, ptr addrspace(4) %constptr, align 4
+ %tmp16 = getelementptr <16 x i8>, ptr addrspace(4) %arg3, i32 1
+ %tmp17 = load <16 x i8>, ptr addrspace(4) %tmp16, !tbaa !0
%tmp17.cast = bitcast <16 x i8> %tmp17 to <4 x i32>
%tmp18 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp17.cast, i32 %arg6, i32 0, i32 0, i32 0)
%tmp19 = extractelement <4 x float> %tmp18, i32 0
@@ -46,10 +45,9 @@ main_body:
; ILPMAX: exp pos0
; ILPMAX-NEXT: exp param0
; ILPMAX: s_endpgm
-define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(4)* inreg %arg, [17 x <16 x i8>] addrspace(4)* inreg %arg1, [17 x <4 x i32>] addrspace(4)* inreg %arg2, [34 x <8 x i32>] addrspace(4)* inreg %arg3, [16 x <16 x i8>] addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
+define amdgpu_vs void @main2(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, ptr addrspace(4) inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
main_body:
- %tmp = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(4)* %arg4, i64 0, i64 0
- %tmp11 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp, align 16, !tbaa !0
+ %tmp11 = load <16 x i8>, ptr addrspace(4) %arg4, align 16, !tbaa !0
%tmp12 = add i32 %arg5, %arg7
%tmp11.cast = bitcast <16 x i8> %tmp11 to <4 x i32>
%tmp13 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp11.cast, i32 %tmp12, i32 0, i32 0, i32 0)
@@ -57,8 +55,8 @@ main_body:
%tmp15 = extractelement <4 x float> %tmp13, i32 1
%tmp16 = extractelement <4 x float> %tmp13, i32 2
%tmp17 = extractelement <4 x float> %tmp13, i32 3
- %tmp18 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(4)* %arg4, i64 0, i64 1
- %tmp19 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp18, align 16, !tbaa !0
+ %tmp18 = getelementptr [16 x <16 x i8>], ptr addrspace(4) %arg4, i64 0, i64 1
+ %tmp19 = load <16 x i8>, ptr addrspace(4) %tmp18, align 16, !tbaa !0
%tmp20 = add i32 %arg5, %arg7
%tmp19.cast = bitcast <16 x i8> %tmp19 to <4 x i32>
%tmp21 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp19.cast, i32 %tmp20, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll
index 6df033f5d6297..203f1633fd8a5 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll
@@ -10,9 +10,9 @@
; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0)
; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @test(i32* %out, i32 %in) {
- store volatile i32 0, i32* %out
- %val = load volatile i32, i32* %out
+define amdgpu_kernel void @test(ptr %out, i32 %in) {
+ store volatile i32 0, ptr %out
+ %val = load volatile i32, ptr %out
ret void
}
@@ -21,8 +21,8 @@ define amdgpu_kernel void @test(i32* %out, i32 %in) {
; GFX9: global_load_dword [[LD:v[0-9]+]]
; GFX9-NEXT: s_waitcnt vmcnt(0){{$}}
; GFX9-NEXT: ds_write_b32 [[LD]]
-define amdgpu_kernel void @test_waitcnt_type_flat_global(i32 addrspace(1)* %in) {
- %val = load volatile i32, i32 addrspace(1)* %in
- store volatile i32 %val, i32 addrspace(3)* undef
+define amdgpu_kernel void @test_waitcnt_type_flat_global(ptr addrspace(1) %in) {
+ %val = load volatile i32, ptr addrspace(1) %in
+ store volatile i32 %val, ptr addrspace(3) undef
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
index 8a0d65ebb1743..b32ce6eb0acc0 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
@@ -15,113 +15,111 @@
@data_generic = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4
@data_reference = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4
-define amdgpu_kernel void @testKernel(i32 addrspace(1)* nocapture %arg) local_unnamed_addr #0 {
+define amdgpu_kernel void @testKernel(ptr addrspace(1) nocapture %arg) local_unnamed_addr #0 {
bb:
- store <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float>* bitcast (float* getelementptr ([100 x float], [100 x float]* addrspacecast ([100 x float] addrspace(1)* @data_generic to [100 x float]*), i64 0, i64 4) to <2 x float>*), align 4
- store <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float>* bitcast (float* getelementptr ([100 x float], [100 x float]* addrspacecast ([100 x float] addrspace(1)* @data_reference to [100 x float]*), i64 0, i64 4) to <2 x float>*), align 4
+ store <2 x float> <float 1.000000e+00, float 1.000000e+00>, ptr getelementptr ([100 x float], ptr addrspacecast (ptr addrspace(1) @data_generic to ptr), i64 0, i64 4), align 4
+ store <2 x float> <float 1.000000e+00, float 1.000000e+00>, ptr getelementptr ([100 x float], ptr addrspacecast (ptr addrspace(1) @data_reference to ptr), i64 0, i64 4), align 4
br label %bb18
bb1: ; preds = %bb18
- %tmp = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+ %tmp = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
%tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp3 = tail call i32 @llvm.amdgcn.workgroup.id.x()
- %tmp4 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 4
- %tmp5 = bitcast i8 addrspace(4)* %tmp4 to i16 addrspace(4)*
- %tmp6 = load i16, i16 addrspace(4)* %tmp5, align 4
+ %tmp4 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 4
+ %tmp6 = load i16, ptr addrspace(4) %tmp4, align 4
%tmp7 = zext i16 %tmp6 to i32
%tmp8 = mul i32 %tmp3, %tmp7
%tmp9 = add i32 %tmp8, %tmp2
- %tmp10 = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %tmp10 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%tmp11 = zext i32 %tmp9 to i64
- %tmp12 = bitcast i8 addrspace(4)* %tmp10 to i64 addrspace(4)*
- %tmp13 = load i64, i64 addrspace(4)* %tmp12, align 8
+ %tmp13 = load i64, ptr addrspace(4) %tmp10, align 8
%tmp14 = add i64 %tmp13, %tmp11
%tmp15 = zext i1 %tmp99 to i32
%tmp16 = and i64 %tmp14, 4294967295
- %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
- store i32 %tmp15, i32 addrspace(1)* %tmp17, align 4
+ %tmp17 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp16
+ store i32 %tmp15, ptr addrspace(1) %tmp17, align 4
ret void
bb18: ; preds = %bb18, %bb
%tmp19 = phi i64 [ 0, %bb ], [ %tmp102, %bb18 ]
%tmp20 = phi i32 [ 0, %bb ], [ %tmp100, %bb18 ]
%tmp21 = phi i1 [ true, %bb ], [ %tmp99, %bb18 ]
- %tmp22 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp19
- %tmp23 = load float, float addrspace(1)* %tmp22, align 4
- %tmp24 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp19
- %tmp25 = load float, float addrspace(1)* %tmp24, align 4
+ %tmp22 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp19
+ %tmp23 = load float, ptr addrspace(1) %tmp22, align 4
+ %tmp24 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp19
+ %tmp25 = load float, ptr addrspace(1) %tmp24, align 4
%tmp26 = fcmp oeq float %tmp23, %tmp25
%tmp27 = and i1 %tmp21, %tmp26
%tmp28 = or i32 %tmp20, 1
%tmp29 = sext i32 %tmp28 to i64
- %tmp30 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp29
- %tmp31 = load float, float addrspace(1)* %tmp30, align 4
- %tmp32 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp29
- %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+ %tmp30 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp29
+ %tmp31 = load float, ptr addrspace(1) %tmp30, align 4
+ %tmp32 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp29
+ %tmp33 = load float, ptr addrspace(1) %tmp32, align 4
%tmp34 = fcmp oeq float %tmp31, %tmp33
%tmp35 = and i1 %tmp27, %tmp34
%tmp36 = add nuw nsw i32 %tmp20, 2
%tmp37 = sext i32 %tmp36 to i64
- %tmp38 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp37
- %tmp39 = load float, float addrspace(1)* %tmp38, align 4
- %tmp40 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp37
- %tmp41 = load float, float addrspace(1)* %tmp40, align 4
+ %tmp38 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp37
+ %tmp39 = load float, ptr addrspace(1) %tmp38, align 4
+ %tmp40 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp37
+ %tmp41 = load float, ptr addrspace(1) %tmp40, align 4
%tmp42 = fcmp oeq float %tmp39, %tmp41
%tmp43 = and i1 %tmp35, %tmp42
%tmp44 = add nuw nsw i32 %tmp20, 3
%tmp45 = sext i32 %tmp44 to i64
- %tmp46 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp45
- %tmp47 = load float, float addrspace(1)* %tmp46, align 4
- %tmp48 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp45
- %tmp49 = load float, float addrspace(1)* %tmp48, align 4
+ %tmp46 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp45
+ %tmp47 = load float, ptr addrspace(1) %tmp46, align 4
+ %tmp48 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp45
+ %tmp49 = load float, ptr addrspace(1) %tmp48, align 4
%tmp50 = fcmp oeq float %tmp47, %tmp49
%tmp51 = and i1 %tmp43, %tmp50
%tmp52 = add nuw nsw i32 %tmp20, 4
%tmp53 = sext i32 %tmp52 to i64
- %tmp54 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp53
- %tmp55 = load float, float addrspace(1)* %tmp54, align 4
- %tmp56 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp53
- %tmp57 = load float, float addrspace(1)* %tmp56, align 4
+ %tmp54 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp53
+ %tmp55 = load float, ptr addrspace(1) %tmp54, align 4
+ %tmp56 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp53
+ %tmp57 = load float, ptr addrspace(1) %tmp56, align 4
%tmp58 = fcmp oeq float %tmp55, %tmp57
%tmp59 = and i1 %tmp51, %tmp58
%tmp60 = add nuw nsw i32 %tmp20, 5
%tmp61 = sext i32 %tmp60 to i64
- %tmp62 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp61
- %tmp63 = load float, float addrspace(1)* %tmp62, align 4
- %tmp64 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp61
- %tmp65 = load float, float addrspace(1)* %tmp64, align 4
+ %tmp62 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp61
+ %tmp63 = load float, ptr addrspace(1) %tmp62, align 4
+ %tmp64 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp61
+ %tmp65 = load float, ptr addrspace(1) %tmp64, align 4
%tmp66 = fcmp oeq float %tmp63, %tmp65
%tmp67 = and i1 %tmp59, %tmp66
%tmp68 = add nuw nsw i32 %tmp20, 6
%tmp69 = sext i32 %tmp68 to i64
- %tmp70 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp69
- %tmp71 = load float, float addrspace(1)* %tmp70, align 4
- %tmp72 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp69
- %tmp73 = load float, float addrspace(1)* %tmp72, align 4
+ %tmp70 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp69
+ %tmp71 = load float, ptr addrspace(1) %tmp70, align 4
+ %tmp72 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp69
+ %tmp73 = load float, ptr addrspace(1) %tmp72, align 4
%tmp74 = fcmp oeq float %tmp71, %tmp73
%tmp75 = and i1 %tmp67, %tmp74
%tmp76 = add nuw nsw i32 %tmp20, 7
%tmp77 = sext i32 %tmp76 to i64
- %tmp78 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp77
- %tmp79 = load float, float addrspace(1)* %tmp78, align 4
- %tmp80 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp77
- %tmp81 = load float, float addrspace(1)* %tmp80, align 4
+ %tmp78 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp77
+ %tmp79 = load float, ptr addrspace(1) %tmp78, align 4
+ %tmp80 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp77
+ %tmp81 = load float, ptr addrspace(1) %tmp80, align 4
%tmp82 = fcmp oeq float %tmp79, %tmp81
%tmp83 = and i1 %tmp75, %tmp82
%tmp84 = add nuw nsw i32 %tmp20, 8
%tmp85 = sext i32 %tmp84 to i64
- %tmp86 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp85
- %tmp87 = load float, float addrspace(1)* %tmp86, align 4
- %tmp88 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp85
- %tmp89 = load float, float addrspace(1)* %tmp88, align 4
+ %tmp86 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp85
+ %tmp87 = load float, ptr addrspace(1) %tmp86, align 4
+ %tmp88 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp85
+ %tmp89 = load float, ptr addrspace(1) %tmp88, align 4
%tmp90 = fcmp oeq float %tmp87, %tmp89
%tmp91 = and i1 %tmp83, %tmp90
%tmp92 = add nuw nsw i32 %tmp20, 9
%tmp93 = sext i32 %tmp92 to i64
- %tmp94 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_generic, i64 0, i64 %tmp93
- %tmp95 = load float, float addrspace(1)* %tmp94, align 4
- %tmp96 = getelementptr inbounds [100 x float], [100 x float] addrspace(1)* @data_reference, i64 0, i64 %tmp93
- %tmp97 = load float, float addrspace(1)* %tmp96, align 4
+ %tmp94 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_generic, i64 0, i64 %tmp93
+ %tmp95 = load float, ptr addrspace(1) %tmp94, align 4
+ %tmp96 = getelementptr inbounds [100 x float], ptr addrspace(1) @data_reference, i64 0, i64 %tmp93
+ %tmp97 = load float, ptr addrspace(1) %tmp96, align 4
%tmp98 = fcmp oeq float %tmp95, %tmp97
%tmp99 = and i1 %tmp91, %tmp98
%tmp100 = add nuw nsw i32 %tmp20, 10
@@ -131,7 +129,7 @@ bb18: ; preds = %bb18, %bb
}
; Function Attrs: nounwind readnone speculatable
-declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
+declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1
; Function Attrs: nounwind readnone speculatable
declare i32 @llvm.amdgcn.workitem.id.x() #1
@@ -140,7 +138,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.x() #1
; Function Attrs: nounwind readnone speculatable
-declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1
+declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1
attributes #0 = { "target-cpu"="fiji" "target-features"="-flat-for-global" }
attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index f1af64eb9b39d..ecf6dd1eb17ed 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -9,20 +9,20 @@
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9PLUS: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: s_barrier
-define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
- %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
- %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+ %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp5 = add nuw nsw i64 %tmp2, 4294967296
%tmp6 = lshr exact i64 %tmp5, 32
- %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
- store i32 %tmp4, i32 addrspace(1)* %tmp7, align 4
+ %tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp6
+ store i32 %tmp4, ptr addrspace(1) %tmp7, align 4
ret void
}
@@ -33,22 +33,22 @@ bb:
; GFX9: s_waitcnt vmcnt(0){{$}}
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
-define amdgpu_kernel void @barrier_vscnt_global(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = add nuw nsw i64 %tmp2, 8589934592
%tmp4 = lshr exact i64 %tmp3, 32
- %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
- store i32 0, i32 addrspace(1)* %tmp5, align 4
+ %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp4
+ store i32 0, ptr addrspace(1) %tmp5, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp6 = add nuw nsw i64 %tmp2, 4294967296
%tmp7 = lshr exact i64 %tmp6, 32
- %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp7
- store i32 1, i32 addrspace(1)* %tmp8, align 4
+ %tmp8 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp7
+ store i32 1, ptr addrspace(1) %tmp8, align 4
ret void
}
@@ -59,24 +59,24 @@ bb:
; GFX9PLUS: s_waitcnt vmcnt(0){{$}}
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
-define amdgpu_kernel void @barrier_vmcnt_vscnt_global(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @barrier_vmcnt_vscnt_global(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = add nuw nsw i64 %tmp2, 8589934592
%tmp4 = lshr exact i64 %tmp3, 32
- %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
- store i32 0, i32 addrspace(1)* %tmp5, align 4
- %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
- %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp4
+ store i32 0, ptr addrspace(1) %tmp5, align 4
+ %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp8 = add nuw nsw i64 %tmp2, 4294967296
%tmp9 = lshr exact i64 %tmp8, 32
- %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp9
- store i32 %tmp7, i32 addrspace(1)* %tmp10, align 4
+ %tmp10 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp9
+ store i32 %tmp7, ptr addrspace(1) %tmp10, align 4
ret void
}
@@ -84,20 +84,20 @@ bb:
; GCN: flat_load_{{dword|b32}}
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: s_barrier
-define amdgpu_kernel void @barrier_vmcnt_flat(i32* %arg) {
+define amdgpu_kernel void @barrier_vmcnt_flat(ptr %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
- %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
- %tmp4 = load i32, i32* %tmp3, align 4
+ %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp1
+ %tmp4 = load i32, ptr %tmp3, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp5 = add nuw nsw i64 %tmp2, 4294967296
%tmp6 = lshr exact i64 %tmp5, 32
- %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp6
- store i32 %tmp4, i32* %tmp7, align 4
+ %tmp7 = getelementptr inbounds i32, ptr %arg, i64 %tmp6
+ store i32 %tmp4, ptr %tmp7, align 4
ret void
}
@@ -107,22 +107,22 @@ bb:
; GFX10PLUS: s_waitcnt lgkmcnt(0){{$}}
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
-define amdgpu_kernel void @barrier_vscnt_flat(i32* %arg) {
+define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = add nuw nsw i64 %tmp2, 8589934592
%tmp4 = lshr exact i64 %tmp3, 32
- %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
- store i32 0, i32* %tmp5, align 4
+ %tmp5 = getelementptr inbounds i32, ptr %arg, i64 %tmp4
+ store i32 0, ptr %tmp5, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp6 = add nuw nsw i64 %tmp2, 4294967296
%tmp7 = lshr exact i64 %tmp6, 32
- %tmp8 = getelementptr inbounds i32, i32* %arg, i64 %tmp7
- store i32 1, i32* %tmp8, align 4
+ %tmp8 = getelementptr inbounds i32, ptr %arg, i64 %tmp7
+ store i32 1, ptr %tmp8, align 4
ret void
}
@@ -131,24 +131,24 @@ bb:
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
-define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(i32* %arg) {
+define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = add nuw nsw i64 %tmp2, 8589934592
%tmp4 = lshr exact i64 %tmp3, 32
- %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
- store i32 0, i32* %tmp5, align 4
- %tmp6 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
- %tmp7 = load i32, i32* %tmp6, align 4
+ %tmp5 = getelementptr inbounds i32, ptr %arg, i64 %tmp4
+ store i32 0, ptr %tmp5, align 4
+ %tmp6 = getelementptr inbounds i32, ptr %arg, i64 %tmp1
+ %tmp7 = load i32, ptr %tmp6, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp8 = add nuw nsw i64 %tmp2, 4294967296
%tmp9 = lshr exact i64 %tmp8, 32
- %tmp10 = getelementptr inbounds i32, i32* %arg, i64 %tmp9
- store i32 %tmp7, i32* %tmp10, align 4
+ %tmp10 = getelementptr inbounds i32, ptr %arg, i64 %tmp9
+ store i32 %tmp7, ptr %tmp10, align 4
ret void
}
@@ -159,24 +159,24 @@ bb:
; GFX10PLUS: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
-define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
+define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
%tmp3 = add nuw nsw i64 %tmp2, 8589934592
%tmp4 = lshr exact i64 %tmp3, 32
- %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
- store i32 0, i32* %tmp5, align 4
- %tmp6 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
- %tmp7 = load i32, i32* %tmp6, align 4
+ %tmp5 = getelementptr inbounds i32, ptr %arg, i64 %tmp4
+ store i32 0, ptr %tmp5, align 4
+ %tmp6 = getelementptr inbounds i32, ptr %arg, i64 %tmp1
+ %tmp7 = load i32, ptr %tmp6, align 4
fence syncscope("workgroup") release
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("workgroup") acquire
%tmp8 = add nuw nsw i64 %tmp2, 4294967296
%tmp9 = lshr exact i64 %tmp8, 32
- %tmp10 = getelementptr inbounds i32, i32* %arg, i64 %tmp9
- store i32 %tmp7, i32* %tmp10, align 4
+ %tmp10 = getelementptr inbounds i32, ptr %arg, i64 %tmp9
+ store i32 %tmp7, ptr %tmp10, align 4
ret void
}
@@ -186,17 +186,17 @@ bb:
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9PLUS: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_{{dword|b32}}
-define amdgpu_kernel void @load_vmcnt_global(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @load_vmcnt_global(ptr addrspace(1) %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
- %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
- %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+ %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
%tmp5 = add nuw nsw i64 %tmp2, 4294967296
%tmp6 = lshr exact i64 %tmp5, 32
- %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
- store i32 %tmp4, i32 addrspace(1)* %tmp7, align 4
+ %tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp6
+ store i32 %tmp4, ptr addrspace(1) %tmp7, align 4
ret void
}
@@ -205,17 +205,17 @@ bb:
; GCN-NOT: vscnt
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_{{dword|b32}}
-define amdgpu_kernel void @load_vmcnt_flat(i32* %arg) {
+define amdgpu_kernel void @load_vmcnt_flat(ptr %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
%tmp2 = shl nuw nsw i64 %tmp1, 32
- %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
- %tmp4 = load i32, i32* %tmp3, align 4
+ %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp1
+ %tmp4 = load i32, ptr %tmp3, align 4
%tmp5 = add nuw nsw i64 %tmp2, 4294967296
%tmp6 = lshr exact i64 %tmp5, 32
- %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp6
- store i32 %tmp4, i32* %tmp7, align 4
+ %tmp7 = getelementptr inbounds i32, ptr %arg, i64 %tmp6
+ store i32 %tmp4, ptr %tmp7, align 4
ret void
}
@@ -224,8 +224,8 @@ bb:
; GFX8_9: s_waitcnt vmcnt(0)
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
-define void @store_vscnt_private(i32 addrspace(5)* %p) {
- store i32 0, i32 addrspace(5)* %p
+define void @store_vscnt_private(ptr addrspace(5) %p) {
+ store i32 0, ptr addrspace(5) %p
ret void
}
@@ -235,8 +235,8 @@ define void @store_vscnt_private(i32 addrspace(5)* %p) {
; GFX8_9: s_waitcnt vmcnt(0)
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
-define void @store_vscnt_global(i32 addrspace(1)* %p) {
- store i32 0, i32 addrspace(1)* %p
+define void @store_vscnt_global(ptr addrspace(1) %p) {
+ store i32 0, ptr addrspace(1) %p
ret void
}
@@ -246,8 +246,8 @@ define void @store_vscnt_global(i32 addrspace(1)* %p) {
; GFX10PLUS: s_waitcnt lgkmcnt(0){{$}}
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
-define void @store_vscnt_flat(i32* %p) {
- store i32 0, i32* %p
+define void @store_vscnt_flat(ptr %p) {
+ store i32 0, ptr %p
ret void
}