[llvm] r298444 - AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 21 14:40:01 PDT 2017
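
The change is mechanical throughout: each test entry point that was defined with the default (unspecified) calling convention is redefined with the amdgpu_kernel calling convention. A minimal sketch of the pattern follows; the function name and body are illustrative only, not taken from any of the modified tests.

  ; before: default calling convention
  define void @example(i32 addrspace(1)* %out) {
    store i32 0, i32 addrspace(1)* %out
    ret void
  }

  ; after: explicit kernel calling convention
  define amdgpu_kernel void @example(i32 addrspace(1)* %out) {
    store i32 0, i32 addrspace(1)* %out
    ret void
  }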


Modified: llvm/trunk/test/CodeGen/AMDGPU/sub.v2i16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sub.v2i16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sub.v2i16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sub.v2i16.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 
 ; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -27,7 +27,7 @@ define void @v_test_sub_v2i16(<2 x i16>
 
 ; VI: s_sub_i32
 ; VI: s_sub_i32
-define void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
+define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
   %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
   %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
   %add = sub <2 x i16> %a, %b
@@ -38,7 +38,7 @@ define void @s_test_sub_v2i16(<2 x i16>
 ; GCN-LABEL: {{^}}s_test_sub_self_v2i16:
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]]
 ; GCN: buffer_store_dword [[ZERO]]
-define void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
+define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
   %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
   %add = sub <2 x i16> %a, %a
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @s_test_sub_self_v2i16(<2 x
 
 ; VI: v_subrev_i32_e32
 ; VI: v_subrev_i32_e32
-define void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
   %add = sub <2 x i16> %a, %b
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
   ret void
@@ -63,7 +63,7 @@ define void @s_test_sub_v2i16_kernarg(<2
 
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffe38, v{{[0-9]+}}
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
-define void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -80,7 +80,7 @@ define void @v_test_sub_v2i16_constant(<
 
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x3df, v{{[0-9]+}}
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}}
-define void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -99,7 +99,7 @@ define void @v_test_sub_v2i16_neg_consta
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
 ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
-define void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -118,7 +118,7 @@ define void @v_test_sub_v2i16_inline_neg
 ; VI-NOT: v_subrev_i16
 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
-define void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -138,7 +138,7 @@ define void @v_test_sub_v2i16_inline_lo_
 ; VI-NOT: v_subrev_i16
 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
-define void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -170,7 +170,7 @@ define void @v_test_sub_v2i16_inline_fp_
 ; VI-NOT: and
 ; VI-NOT: shl
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
-define void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -205,7 +205,7 @@ define void @v_test_sub_v2i16_zext_to_v2
 ; VI-DAG: v_subrev_u16_e32
 
 ; VI: buffer_store_dwordx4
-define void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -231,7 +231,7 @@ define void @v_test_sub_v2i16_zext_to_v2
 ; VI: v_subrev_u16_e32
 ; VI: v_subrev_u16_e32
 ; VI: buffer_store_dwordx2
-define void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -259,7 +259,7 @@ define void @v_test_sub_v2i16_sext_to_v2
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-define void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid

Modified: llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll Tue Mar 21 16:39:51 2017
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL:{{^}}row_filter_C1_D0:
-define void @row_filter_C1_D0() #0 {
+define amdgpu_kernel void @row_filter_C1_D0() #0 {
 entry:
   br i1 undef, label %for.inc.1, label %do.body.preheader
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll Tue Mar 21 16:39:51 2017
@@ -20,7 +20,7 @@ target triple="amdgcn--"
 ; CHECK-NEXT: s_mov_b32 s6, -1
 ; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0
 ; CHECK-NEXT: s_endpgm
-define void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
 entry:
   %v0 = insertelement <4 x float> undef, float %a0, i32 0
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0

Modified: llvm/trunk/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; Just make sure this test doesn't crash.
 ; CHECK-LABEL: foobar:
 ; CHECK: s_endpgm
-define void @foobar() {
+define amdgpu_kernel void @foobar() {
   %v0 = icmp eq <4 x i32> undef, <i32 0, i32 1, i32 2, i32 3>
   %v3 = sext <4 x i1> %v0 to <4 x i32>
   %v4 = extractelement <4 x i32> %v3, i32 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/subreg-intervals.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/subreg-intervals.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/subreg-intervals.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/subreg-intervals.mir Tue Mar 21 16:39:51 2017
@@ -10,8 +10,8 @@
 # CHECK-LABEL: Machine code for function test1:
 
 --- |
-  define void @test0() { ret void }
-  define void @test1() { ret void }
+  define amdgpu_kernel void @test0() { ret void }
+  define amdgpu_kernel void @test1() { ret void }
 ...
 ---
 name: test0

Modified: llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll Tue Mar 21 16:39:51 2017
@@ -14,7 +14,7 @@ declare void @llvm.amdgcn.s.dcache.wb()
 ; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define void @target_none() #0 {
+define amdgpu_kernel void @target_none() #0 {
   %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
   %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -30,7 +30,7 @@ define void @target_none() #0 {
 ; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define void @target_tahiti() #1 {
+define amdgpu_kernel void @target_tahiti() #1 {
   %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
   %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -46,7 +46,7 @@ define void @target_tahiti() #1 {
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
 ; CHECK: s_dcache_inv_vol
-define void @target_bonaire() #3 {
+define amdgpu_kernel void @target_bonaire() #3 {
   %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
   %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -63,7 +63,7 @@ define void @target_bonaire() #3 {
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x400
 ; CHECK: flat_store_dword
 ; CHECK: s_dcache_wb{{$}}
-define void @target_fiji() #4 {
+define amdgpu_kernel void @target_fiji() #4 {
   %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
   %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -79,7 +79,7 @@ define void @target_fiji() #4 {
 ; CHECK-LABEL: {{^}}promote_alloca_enabled:
 ; CHECK: ds_read_b32
 ; CHECK: ; LDSByteSize: 5120
-define void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
+define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp = load i32, i32 addrspace(1)* %in, align 4
@@ -93,7 +93,7 @@ entry:
 ; CHECK: SCRATCH_RSRC_DWORD0
 ; CHECK: SCRATCH_RSRC_DWORD1
 ; CHECK: ScratchSize: 24
-define void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
+define amdgpu_kernel void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp = load i32, i32 addrspace(1)* %in, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/trap.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/trap.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/trap.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/trap.ll Tue Mar 21 16:39:51 2017
@@ -38,7 +38,7 @@ declare void @llvm.debugtrap() #0
 ; TRAP-BIT: enable_trap_handler = 1
 ; NO-TRAP-BIT: enable_trap_handler = 0
 ; NO-MESA-TRAP: s_endpgm
-define void @hsa_trap() {
+define amdgpu_kernel void @hsa_trap() {
   call void @llvm.trap()
   ret void
 }
@@ -64,7 +64,7 @@ define void @hsa_trap() {
 ; TRAP-BIT: enable_trap_handler = 1
 ; NO-TRAP-BIT: enable_trap_handler = 0
 ; NO-MESA-TRAP: s_endpgm
-define void @hsa_debugtrap() {
+define amdgpu_kernel void @hsa_debugtrap() {
   call void @llvm.debugtrap()
   ret void
 }
@@ -75,7 +75,7 @@ define void @hsa_debugtrap() {
 ; NO-TRAP-BIT: enable_trap_handler = 0
 ; NO-HSA-TRAP: s_endpgm
 ; NO-MESA-TRAP: s_endpgm
-define void @trap() {
+define amdgpu_kernel void @trap() {
   call void @llvm.trap()
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
 ; CHECK: buffer_load_dword v
 ; CHECK: buffer_store_dword v
-define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %bc = bitcast <2 x i32> %ld to i64
   %trunc = trunc i64 %bc to i32
@@ -15,7 +15,7 @@ define void @trunc_i64_bitcast_v2i32(i32
 ; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32:
 ; CHECK: buffer_load_dword v
 ; CHECK: buffer_store_dword v
-define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
   %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
   %bc = bitcast <3 x i32> %ld to i96
   %trunc = trunc i96 %bc to i32
@@ -26,7 +26,7 @@ define void @trunc_i96_bitcast_v3i32(i32
 ; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32:
 ; CHECK: buffer_load_dword v
 ; CHECK: buffer_store_dword v
-define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %bc = bitcast <4 x i32> %ld to i128
   %trunc = trunc i128 %bc to i32
@@ -38,7 +38,7 @@ define void @trunc_i128_bitcast_v4i32(i3
 ; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_short [[VAL]]
-define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
   %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %bc = bitcast <2 x i16> %ld to i32
   %trunc = trunc i32 %bc to i16
@@ -54,7 +54,7 @@ define void @trunc_i16_bitcast_v2i16(i16
 ; SI: buffer_load_dword v[[VAL:[0-9]+]]
 ; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]]
 ; CHECK: buffer_store_short [[VAL]]
-define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
   %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %bc = bitcast <4 x i16> %ld to i64
   %trunc = trunc i64 %bc to i16
@@ -66,7 +66,7 @@ define void @trunc_i16_bitcast_v4i16(i16
 ; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8:
 ; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_byte [[VAL]]
-define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
   %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %bc = bitcast <2 x i8> %ld to i16
   %trunc = trunc i16 %bc to i8
@@ -77,7 +77,7 @@ define void @trunc_i8_bitcast_v2i8(i8 ad
 ; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_byte [[VAL]]
-define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
   %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %bc = bitcast <4 x i8> %ld to i32
   %trunc = trunc i32 %bc to i8
@@ -88,7 +88,7 @@ define void @trunc_i32_bitcast_v4i8(i8 a
 ; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_byte [[VAL]]
-define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
   %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
   %bc = bitcast <3 x i8> %ld to i24
   %trunc = trunc i24 %bc to i8

Modified: llvm/trunk/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/trunc-cmp-constant.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/trunc-cmp-constant.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/trunc-cmp-constant.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; SI: v_cmp_eq_u32_e32 vcc, 0, [[TMP]]{{$}}
 ; SI: v_cndmask_b32_e64
 ; SI: buffer_store_byte
-define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, 0
@@ -25,7 +25,7 @@ define void @sextload_i1_to_i32_trunc_cm
 ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, 0
@@ -36,7 +36,7 @@ define void @zextload_i1_to_i32_trunc_cm
 ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, 1
@@ -48,7 +48,7 @@ define void @sextload_i1_to_i32_trunc_cm
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, 1
@@ -60,7 +60,7 @@ define void @zextload_i1_to_i32_trunc_cm
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, -1
@@ -71,7 +71,7 @@ define void @sextload_i1_to_i32_trunc_cm
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, -1
@@ -84,7 +84,7 @@ define void @zextload_i1_to_i32_trunc_cm
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, 0
@@ -96,7 +96,7 @@ define void @sextload_i1_to_i32_trunc_cm
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, 0
@@ -107,7 +107,7 @@ define void @zextload_i1_to_i32_trunc_cm
 ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, 1
@@ -122,7 +122,7 @@ define void @sextload_i1_to_i32_trunc_cm
 ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, 1
@@ -137,7 +137,7 @@ define void @zextload_i1_to_i32_trunc_cm
 ; XSI: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}}
 ; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]]
 ; XSI-NEXT: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, -1
@@ -148,7 +148,7 @@ define void @sextload_i1_to_i32_trunc_cm
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, -1
@@ -162,7 +162,7 @@ define void @zextload_i1_to_i32_trunc_cm
 ; SI: v_cmp_ne_u32_e32 vcc, -1, [[LOAD]]{{$}}
 ; SI-NEXT: v_cndmask_b32_e64
 ; SI: {{buffer|flat}}_store_byte
-define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr i8, i8 addrspace(1)* %in, i32 %tid.x
   %load = load i8, i8 addrspace(1)* %in.ptr

Modified: llvm/trunk/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll Tue Mar 21 16:39:51 2017
@@ -2,7 +2,7 @@
 
 ; GCN-LABEL: {{^}}global_truncstore_f64_to_f16:
 ; GCN: s_endpgm
-define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %val = load double, double addrspace(1)* %in
   %cvt = fptrunc double %val to half
   store half %cvt, half addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @global_truncstore_f64_to_f1
 
 ; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
   %val = load <2 x double>, <2 x double> addrspace(1)* %in
   %cvt = fptrunc <2 x double> %val to <2 x half>
   store <2 x half> %cvt, <2 x half> addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @global_truncstore_v2f64_to_
 
 ; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
   %val = load <3 x double>, <3 x double> addrspace(1)* %in
   %cvt = fptrunc <3 x double> %val to <3 x half>
   store <3 x half> %cvt, <3 x half> addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @global_truncstore_v3f64_to_
 
 ; GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
   %val = load <4 x double>, <4 x double> addrspace(1)* %in
   %cvt = fptrunc <4 x double> %val to <4 x half>
   store <4 x half> %cvt, <4 x half> addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @global_truncstore_v4f64_to_
 
 ; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
   %val = load <8 x double>, <8 x double> addrspace(1)* %in
   %cvt = fptrunc <8 x double> %val to <8 x half>
   store <8 x half> %cvt, <8 x half> addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @global_truncstore_v8f64_to_
 
 ; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
   %val = load <16 x double>, <16 x double> addrspace(1)* %in
   %cvt = fptrunc <16 x double> %val to <16 x half>
   store <16 x half> %cvt, <16 x half> addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/trunc-store-i1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/trunc-store-i1.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/trunc-store-i1.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/trunc-store-i1.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
 ; SI: buffer_store_byte [[VREG]],
-define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
+define amdgpu_kernel void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
   %trunc = trunc i32 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
@@ -15,7 +15,7 @@ define void @global_truncstore_i32_to_i1
 
 ; SI-LABEL: {{^}}global_truncstore_i64_to_i1:
 ; SI: buffer_store_byte
-define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
+define amdgpu_kernel void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
   %trunc = trunc i64 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
@@ -26,13 +26,13 @@ define void @global_truncstore_i64_to_i1
 ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
 ; SI: buffer_store_byte [[VREG]],
-define void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
+define amdgpu_kernel void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
   %trunc = trunc i16 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
 }
 ; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
-define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
+define amdgpu_kernel void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
   %add = add i16 %val0, %val1
   %trunc = trunc i16 %add to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/trunc-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/trunc-store.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/trunc-store.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/trunc-store.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8:
 ; SI: buffer_store_dwordx4
-define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
+define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
   %trunc = trunc <16 x i32> %in to <16 x i8>
   store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
   ret void
@@ -11,7 +11,7 @@ define void @truncstore_arg_v16i32_to_v1
 
 ; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8:
 ; SI: buffer_store_dwordx4
-define void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) {
+define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) {
   %trunc = trunc <16 x i64> %in to <16 x i8>
   store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 
 ; CHECK-LABEL: {{^}}test:
 ; CHECK: MEM_RAT_CACHELESS STORE_RAW
-define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
+define amdgpu_kernel void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
 entry:
   %0 = icmp eq i32 %cond, 0
   br i1 %0, label %if, label %done

Modified: llvm/trunk/test/CodeGen/AMDGPU/trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/trunc.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/trunc.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/trunc.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
 ; GCN-LABEL: {{^}}trunc_i64_to_i32_store:
 ; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1],
 ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
@@ -28,7 +28,7 @@ define void @trunc_i64_to_i32_store(i32
 ; SI: buffer_store_dword [[VSHL]]
 ; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]]
 
-define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
   %b = shl i64 %a, 2
   %result = trunc i64 %b to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -46,7 +46,7 @@ define void @trunc_load_shl_i64(i32 addr
 ; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]]
 ; GCN: v_mov_b32_e32
 ; GCN: v_mov_b32_e32
-define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
   %aa = add i64 %a, 234 ; Prevent shrinking store.
   %b = shl i64 %aa, 2
   %result = trunc i64 %b to i32
@@ -57,7 +57,7 @@ define void @trunc_shl_i64(i64 addrspace
 
 ; GCN-LABEL: {{^}}trunc_i32_to_i1:
 ; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
-define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
+define amdgpu_kernel void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
   %a = load i32, i32 addrspace(1)* %ptr, align 4
   %trunc = trunc i32 %a to i1
   %result = select i1 %trunc, i32 1, i32 0
@@ -67,7 +67,7 @@ define void @trunc_i32_to_i1(i32 addrspa
 
 ; GCN-LABEL: {{^}}trunc_i8_to_i1:
 ; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
-define void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) {
+define amdgpu_kernel void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) {
   %a = load i8, i8 addrspace(1)* %ptr, align 4
   %trunc = trunc i8 %a to i1
   %result = select i1 %trunc, i8 1, i8 0
@@ -77,7 +77,7 @@ define void @trunc_i8_to_i1(i8 addrspace
 
 ; GCN-LABEL: {{^}}sgpr_trunc_i16_to_i1:
 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
-define void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) {
+define amdgpu_kernel void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) {
   %trunc = trunc i16 %a to i1
   %result = select i1 %trunc, i16 1, i16 0
   store i16 %result, i16 addrspace(1)* %out, align 4
@@ -86,7 +86,7 @@ define void @sgpr_trunc_i16_to_i1(i16 ad
 
 ; GCN-LABEL: {{^}}sgpr_trunc_i32_to_i1:
 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
-define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
   %trunc = trunc i32 %a to i1
   %result = select i1 %trunc, i32 1, i32 0
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -99,7 +99,7 @@ define void @sgpr_trunc_i32_to_i1(i32 ad
 ; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]]
 ; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}}
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]]
-define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
+define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
   %trunc = trunc i64 %x to i1
   %sel = select i1 %trunc, i32 63, i32 -12
   store i32 %sel, i32 addrspace(1)* %out
@@ -112,7 +112,7 @@ define void @s_trunc_i64_to_i1(i32 addrs
 ; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]]
 ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]]
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
-define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid

Modified: llvm/trunk/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/tti-unroll-prefs.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/tti-unroll-prefs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/tti-unroll-prefs.ll Tue Mar 21 16:39:51 2017
@@ -19,7 +19,7 @@
 ; CHECK: store i8 0, i8 addrspace(1)*
 ; CHECK-NOT: store i8 0, i8 addrspace(1)*
 ; CHECK: ret void
-define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) {
 entry:
   %add = add nsw i32 %b, 4
   %cmp = icmp sgt i32 %add, %a

Modified: llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 
 ; EG: ADDC_UINT
 ; EG: ADDC_UINT
-define void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
@@ -27,7 +27,7 @@ define void @s_uaddo_i64_zext(i64 addrsp
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
@@ -42,7 +42,7 @@ define void @s_uaddo_i32(i32 addrspace(1
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
@@ -63,7 +63,7 @@ define void @v_uaddo_i32(i32 addrspace(1
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
@@ -85,7 +85,7 @@ define void @v_uaddo_i32_novcc(i32 addrs
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
@@ -100,7 +100,7 @@ define void @s_uaddo_i64(i64 addrspace(1
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr
@@ -118,7 +118,7 @@ define void @v_uaddo_i64(i64 addrspace(1
 ; FUNC-LABEL: {{^}}v_uaddo_i16:
 ; VI: v_add_u16_e32
 ; VI: v_cmp_lt_u16_e32
-define void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr

Modified: llvm/trunk/test/CodeGen/AMDGPU/udiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/udiv.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/udiv.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/udiv.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@
 ; EG: CF_END
 
 ; SI: v_rcp_iflag_f32_e32
-define void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -21,7 +21,7 @@ define void @udiv_i32(i32 addrspace(1)*
 
 ; FUNC-LABEL: {{^}}s_udiv_i32:
 ; SI: v_rcp_iflag_f32_e32
-define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %result = udiv i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -38,7 +38,7 @@ define void @s_udiv_i32(i32 addrspace(1)
 ; SI: v_rcp_iflag_f32_e32
 ; SI: v_rcp_iflag_f32_e32
 ; SI: s_endpgm
-define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -50,7 +50,7 @@ define void @udiv_v2i32(<2 x i32> addrsp
 ; FUNC-LABEL: {{^}}udiv_v4i32:
 ; EG: CF_END
 ; SI: s_endpgm
-define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -63,7 +63,7 @@ define void @udiv_v4i32(<4 x i32> addrsp
 ; SI: buffer_load_dword [[VAL:v[0-9]+]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 4, [[VAL]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %result = udiv i32 %a, 16
@@ -77,7 +77,7 @@ define void @udiv_i32_div_pow2(i32 addrs
 ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %result = udiv i32 %a, 34259182
@@ -91,7 +91,7 @@ define void @udiv_i32_div_k_even(i32 add
 ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %result = udiv i32 %a, 34259183
@@ -103,7 +103,7 @@ define void @udiv_i32_div_k_odd(i32 addr
 ; SI: v_rcp_f32
 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}}
 ; SI: buffer_store_dword [[TRUNC]]
-define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -117,7 +117,7 @@ define void @v_udiv_i8(i32 addrspace(1)*
 ; SI: v_rcp_f32
 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}}
 ; SI: buffer_store_dword [[TRUNC]]
-define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in
   %den = load i16, i16 addrspace(1) * %den_ptr
@@ -131,7 +131,7 @@ define void @v_udiv_i16(i32 addrspace(1)
 ; SI: v_rcp_f32
 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
 ; SI: buffer_store_dword [[TRUNC]]
-define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
   %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
   %num = load i23, i23 addrspace(1) * %in
   %den = load i23, i23 addrspace(1) * %den_ptr
@@ -143,7 +143,7 @@ define void @v_udiv_i23(i32 addrspace(1)
 
 ; FUNC-LABEL: {{^}}v_udiv_i24:
 ; SI-NOT: v_rcp_f32
-define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
   %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
   %num = load i24, i24 addrspace(1) * %in
   %den = load i24, i24 addrspace(1) * %den_ptr
@@ -159,7 +159,7 @@ define void @v_udiv_i24(i32 addrspace(1)
 ; SI: v_mul_hi_u32
 ; SI: v_mul_hi_u32
 
-define void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
+define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
   %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
   store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
@@ -168,7 +168,7 @@ define void @scalarize_mulhu_4xi32(<4 x
 
 ; FUNC-LABEL: {{^}}test_udiv2:
 ; SI: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1
-define void @test_udiv2(i32 %p) {
+define amdgpu_kernel void @test_udiv2(i32 %p) {
   %i = udiv i32 %p, 2
   store volatile i32 %i, i32 addrspace(1)* undef
   ret void
@@ -178,7 +178,7 @@ define void @test_udiv2(i32 %p) {
 ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
 ; SI: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
 ; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-define void @test_udiv_3_mulhu(i32 %p) {
+define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
    %i = udiv i32 %p, 3
    store volatile i32 %i, i32 addrspace(1)* undef
    ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/udivrem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/udivrem.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/udivrem.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/udivrem.ll Tue Mar 21 16:39:51 2017
@@ -51,7 +51,7 @@
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
-define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
+define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
   %result0 = udiv i32 %x, %y
   store i32 %result0, i32 addrspace(1)* %out0
   %result1 = urem i32 %x, %y
@@ -158,7 +158,7 @@ define void @test_udivrem(i32 addrspace(
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
-define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
   %result0 = udiv <2 x i32> %x, %y
   store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
   %result1 = urem <2 x i32> %x, %y
@@ -340,7 +340,7 @@ define void @test_udivrem_v2(<2 x i32> a
 ; SI-DAG: v_subrev_i32_e32
 ; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
-define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
   %result0 = udiv <4 x i32> %x, %y
   store <4 x i32> %result0, <4 x i32> addrspace(1)* %out
   %result1 = urem <4 x i32> %x, %y

Modified: llvm/trunk/test/CodeGen/AMDGPU/udivrem24.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/udivrem24.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/udivrem24.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/udivrem24.ll Tue Mar 21 16:39:51 2017
@@ -12,7 +12,7 @@
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -31,7 +31,7 @@ define void @udiv24_i8(i8 addrspace(1)*
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in, align 2
   %den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -50,7 +50,7 @@ define void @udiv24_i16(i16 addrspace(1)
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -67,7 +67,7 @@ define void @udiv23_i32(i32 addrspace(1)
 ; SI: v_rcp_iflag
 ; SI-NOT v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -84,7 +84,7 @@ define void @udiv24_i32(i32 addrspace(1)
 ; SI: v_rcp_iflag
 ; SI-NOT v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -101,7 +101,7 @@ define void @no_udiv24_u23_u24_i32(i32 a
 ; SI: v_rcp_iflag
 ; SI-NOT v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -121,7 +121,7 @@ define void @no_udiv24_u24_u23_i32(i32 a
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -141,7 +141,7 @@ define void @udiv25_i32(i32 addrspace(1)
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -161,7 +161,7 @@ define void @test_no_udiv24_i32_1(i32 ad
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -184,7 +184,7 @@ define void @test_no_udiv24_i32_2(i32 ad
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -203,7 +203,7 @@ define void @urem24_i8(i8 addrspace(1)*
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in, align 2
   %den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -215,7 +215,7 @@ define void @urem24_i16(i16 addrspace(1)
 ; FUNC-LABEL: {{^}}urem24_i32:
 ; SI-NOT: v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -235,7 +235,7 @@ define void @urem24_i32(i32 addrspace(1)
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -255,7 +255,7 @@ define void @urem25_i32(i32 addrspace(1)
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -275,7 +275,7 @@ define void @test_no_urem24_i32_1(i32 ad
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -294,7 +294,7 @@ define void @test_no_urem24_i32_2(i32 ad
 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
 
 ; EG: RECIP_IEEE
-define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -313,7 +313,7 @@ define void @test_udiv24_u16_u23_i32(i32
 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
 
 ; EG: RECIP_IEEE
-define void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/udivrem64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/udivrem64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/udivrem64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/udivrem64.ll Tue Mar 21 16:39:51 2017
@@ -70,7 +70,7 @@
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = udiv i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -144,7 +144,7 @@ define void @test_udiv(i64 addrspace(1)*
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -159,7 +159,7 @@ define void @test_urem(i64 addrspace(1)*
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = lshr i64 %x, 33
   %2 = lshr i64 %y, 33
   %result = udiv i64 %1, %2
@@ -176,7 +176,7 @@ define void @test_udiv3264(i64 addrspace
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = lshr i64 %x, 33
   %2 = lshr i64 %y, 33
   %result = urem i64 %1, %2
@@ -195,7 +195,7 @@ define void @test_urem3264(i64 addrspace
 ;VI-NOT: v_lshrrev_b64
 ;GCN: v_mad_f32
 ;GCN: s_endpgm
-define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = lshr i64 %x, 41
   %2 = lshr i64 %y, 41
   %result = udiv i64 %1, %2
@@ -214,7 +214,7 @@ define void @test_udiv2364(i64 addrspace
 ;VI-NOT: v_lshrrev_b64
 ;GCN: v_mad_f32
 ;GCN: s_endpgm
-define void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = lshr i64 %x, 41
   %2 = lshr i64 %y, 41
   %result = urem i64 %1, %2

Modified: llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.f64.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %val = load i64, i64 addrspace(1)* %gep, align 8
@@ -19,21 +19,21 @@ define void @v_uint_to_fp_i64_to_f64(dou
 }
 
 ; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64
-define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
   %cast = uitofp i64 %in to double
   store double %cast, double addrspace(1)* %out, align 8
   ret void
 }
 
 ; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64
-define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
   %cast = uitofp <2 x i64> %in to <2 x double>
   store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
   ret void
 }
 
 ; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64
-define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
   %cast = uitofp <4 x i64> %in to <4 x double>
   store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
   ret void
@@ -42,7 +42,7 @@ define void @s_uint_to_fp_v4i64_to_v4f64
 ; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64
 ; SI: v_cvt_f64_u32_e32
 ; SI: s_endpgm
-define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
   %cast = uitofp i32 %in to double
   store double %cast, double addrspace(1)* %out, align 8
   ret void
@@ -52,7 +52,7 @@ define void @s_uint_to_fp_i32_to_f64(dou
 ; SI: v_cvt_f64_u32_e32
 ; SI: v_cvt_f64_u32_e32
 ; SI: s_endpgm
-define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) {
   %cast = uitofp <2 x i32> %in to <2 x double>
   store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -64,7 +64,7 @@ define void @s_uint_to_fp_v2i32_to_v2f64
 ; SI: v_cvt_f64_u32_e32
 ; SI: v_cvt_f64_u32_e32
 ; SI: s_endpgm
-define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) {
   %cast = uitofp <4 x i32> %in to <4 x double>
   store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
   ret void
@@ -79,7 +79,7 @@ define void @s_uint_to_fp_v4i32_to_v4f64
 ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; SI: s_endpgm
-define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to double
   store double %fp, double addrspace(1)* %out, align 4
@@ -91,7 +91,7 @@ define void @uint_to_fp_i1_to_f64(double
 ; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) {
+define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) {
   %fp = uitofp i1 %in to double
   store double %fp, double addrspace(1)* %out, align 8
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.i64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.i64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.i64.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 ; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f16:
-define void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
   %result = uitofp i64 %in to half
   store half %result, half addrspace(1)* %out
   ret void
@@ -24,7 +24,7 @@ define void @s_uint_to_fp_i64_to_f16(hal
 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 [[VR_F16:v[0-9]+]], [[VR]]
 ; GCN: {{buffer|flat}}_store_short {{.*}}[[VR_F16]]
-define void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -35,7 +35,7 @@ define void @v_uint_to_fp_i64_to_f16(hal
 }
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32:
-define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
   %result = uitofp i64 %in to float
   store float %result, float addrspace(1)* %out
   ret void
@@ -54,7 +54,7 @@ define void @s_uint_to_fp_i64_to_f32(flo
 
 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
 ; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]]
-define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -65,14 +65,14 @@ define void @v_uint_to_fp_i64_to_f32(flo
 }
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f32:
-define void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
   %result = uitofp <2 x i64> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f32:
-define void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -83,14 +83,14 @@ define void @v_uint_to_fp_v4i64_to_v4f32
 }
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f16:
-define void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
   %result = uitofp <2 x i64> %in to <2 x half>
   store <2 x half> %result, <2 x half> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f16:
-define void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid

Modified: llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 ; SI: v_cvt_f32_u32_e32
 
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-define void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
   %result = uitofp i32 %in to float
   store float %result, float addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@ define void @s_uint_to_fp_i32_to_f32(flo
 ; SI: v_cvt_f32_u32_e32 {{v[0-9]+}}, {{v[0-9]+$}}
 
 ; R600: INT_TO_FLT
-define void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -32,7 +32,7 @@ define void @v_uint_to_fp_i32_to_f32(flo
 
 ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
-define void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 {
   %result = uitofp <2 x i32> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
@@ -49,7 +49,7 @@ define void @s_uint_to_fp_v2i32_to_v2f32
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %value = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = uitofp <4 x i32> %value to <4 x float>
   store <4 x float> %result, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @s_uint_to_fp_v4i32_to_v4f32
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -81,7 +81,7 @@ define void @v_uint_to_fp_v4i32(<4 x flo
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to float
   store float %fp, float addrspace(1)* %out
@@ -92,7 +92,7 @@ define void @s_uint_to_fp_i1_to_f32(floa
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
   %fp = uitofp i1 %in to float
   store float %fp, float addrspace(1)* %out
   ret void
@@ -105,7 +105,7 @@ define void @s_uint_to_fp_i1_to_f32_load
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
 ; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]]
 ; SI: s_endpgm
-define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -126,7 +126,7 @@ define void @v_uint_to_fp_i1_f32_load(fl
 ; R600-DAG: SETGT_UINT
 ; R600-DAG: SETE_INT
 
-define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
 entry:
   %cvt = uitofp i64 %in to float
   store float %cvt, float addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/uitofp.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/uitofp.f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/uitofp.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/uitofp.f16.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @uitofp_i16_to_f16(
+define amdgpu_kernel void @uitofp_i16_to_f16(
     half addrspace(1)* %r,
     i16 addrspace(1)* %a) {
 entry:
@@ -24,7 +24,7 @@ entry:
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @uitofp_i32_to_f16(
+define amdgpu_kernel void @uitofp_i32_to_f16(
     half addrspace(1)* %r,
     i32 addrspace(1)* %a) {
 entry:
@@ -48,7 +48,7 @@ entry:
 ; GCN-DAG: v_or_b32_e32
 ; GCN:     buffer_store_dword
 ; GCN:     s_endpgm
-define void @uitofp_v2i16_to_v2f16(
+define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x i16> addrspace(1)* %a) {
 entry:
@@ -68,7 +68,7 @@ entry:
 ; GCN-DAG: v_or_b32_e32
 ; GCN:     buffer_store_dword
 ; GCN:     s_endpgm
-define void @uitofp_v2i32_to_v2f16(
+define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x i32> addrspace(1)* %a) {
 entry:

Modified: llvm/trunk/test/CodeGen/AMDGPU/umed3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/umed3.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/umed3.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/umed3.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32:
 ; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -25,7 +25,7 @@ define void @v_test_umed3_r_i_i_i32(i32
 ; GCN-LABEL: {{^}}v_test_umed3_multi_use_r_i_i_i32:
 ; GCN: v_max_u32
 ; GCN: v_min_u32
-define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -45,7 +45,7 @@ define void @v_test_umed3_multi_use_r_i_
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32:
 ; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
 ; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
-define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -64,7 +64,7 @@ define void @v_test_umed3_r_i_i_constant
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32:
 ; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
 ; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
-define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -83,7 +83,7 @@ define void @v_test_umed3_r_i_i_sign_mis
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i64:
 ; GCN: v_cmp_lt_u64
 ; GCN: v_cmp_gt_u64
-define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -102,7 +102,7 @@ define void @v_test_umed3_r_i_i_i64(i64
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16:
 ; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
@@ -173,7 +173,7 @@ define internal i8 @umax8(i8 %x, i8 %y)
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -185,7 +185,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_1:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -197,7 +197,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_2:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -209,7 +209,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_3:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -221,7 +221,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_4:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -233,7 +233,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_5:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -245,7 +245,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_6:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -257,7 +257,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_7:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -269,7 +269,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_8:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -281,7 +281,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_9:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -293,7 +293,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_10:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -305,7 +305,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_11:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -317,7 +317,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_12:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -329,7 +329,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_13:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -341,7 +341,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_14:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -353,7 +353,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_15:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -368,7 +368,7 @@ bb:
 ; GCN: s_and_b32
 ; GCN: s_and_b32
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
 bb:
   %tmp0 = call i16 @umin16(i16 %x, i16 %y)
   %tmp1 = call i16 @umax16(i16 %x, i16 %y)
@@ -383,7 +383,7 @@ bb:
 ; GCN: s_and_b32
 ; GCN: s_and_b32
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
 bb:
   %tmp0 = call i8 @umin8(i8 %x, i8 %y)
   %tmp1 = call i8 @umax8(i8 %x, i8 %y)
@@ -395,7 +395,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_0:
 ; GCN-NOT: v_med3_u32
-define void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -408,7 +408,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_1:
 ; GCN-NOT: v_med3_u32
-define void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -421,7 +421,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_2:
 ; GCN-NOT: v_med3_u32
-define void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -434,7 +434,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_result:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -447,7 +447,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src0:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 1, i32 %y)
   %tmp1 = call i32 @umax(i32 1, i32 %y)
@@ -459,7 +459,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src1:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 2)
   %tmp1 = call i32 @umax(i32 %x, i32 2)
@@ -471,7 +471,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src2:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 9
-define void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -491,7 +491,7 @@ bb:
 ; VI: v_max_u16
 
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+define amdgpu_kernel void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid

Modified: llvm/trunk/test/CodeGen/AMDGPU/unaligned-load-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/unaligned-load-store.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/unaligned-load-store.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/unaligned-load-store.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
   %v = load i16, i16 addrspace(3)* %p, align 1
   store i16 %v, i16 addrspace(3)* %r, align 1
   ret void
@@ -23,7 +23,7 @@ define void @local_unaligned_load_store_
 ; UNALIGNED: buffer_load_ushort
 ; UNALIGNED: buffer_store_short
 ; SI: s_endpgm
-define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
   %v = load i16, i16 addrspace(1)* %p, align 1
   store i16 %v, i16 addrspace(1)* %r, align 1
   ret void
@@ -42,7 +42,7 @@ define void @global_unaligned_load_store
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
   %v = load i32, i32 addrspace(3)* %p, align 1
   store i32 %v, i32 addrspace(3)* %r, align 1
   ret void
@@ -60,7 +60,7 @@ define void @local_unaligned_load_store_
 
 ; UNALIGNED: buffer_load_dword
 ; UNALIGNED: buffer_store_dword
-define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
   %v = load i32, i32 addrspace(1)* %p, align 1
   store i32 %v, i32 addrspace(1)* %r, align 1
   ret void
@@ -74,7 +74,7 @@ define void @global_unaligned_load_store
 
 ; UNALIGNED: buffer_load_dword
 ; UNALIGNED: buffer_store_dword
-define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
   %v = load i32, i32 addrspace(1)* %p, align 2
   store i32 %v, i32 addrspace(1)* %r, align 2
   ret void
@@ -85,7 +85,7 @@ define void @global_align2_load_store_i3
 ; GCN: ds_read_u16
 ; GCN: ds_write_b16
 ; GCN: ds_write_b16
-define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
   %v = load i32, i32 addrspace(3)* %p, align 2
   store i32 %v, i32 addrspace(3)* %r, align 2
   ret void
@@ -132,7 +132,7 @@ define void @local_align2_load_store_i32
 ; SI-NOT: v_lshl
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
   %v = load i64, i64 addrspace(3)* %p, align 1
   store i64 %v, i64 addrspace(3)* %r, align 1
   ret void
@@ -179,7 +179,7 @@ define void @local_unaligned_load_store_
 ; SI-NOT: v_lshl
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
   %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
   store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
   ret void
@@ -209,7 +209,7 @@ define void @local_unaligned_load_store_
 
 ; UNALIGNED: buffer_load_dwordx2
 ; UNALIGNED: buffer_store_dwordx2
-define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(1)* %p, align 2
   store i64 %v, i64 addrspace(1)* %r, align 2
   ret void
@@ -239,7 +239,7 @@ define void @global_align2_load_store_i6
 
 ; UNALIGNED: buffer_load_dwordx2
 ; UNALIGNED: buffer_store_dwordx2
-define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(1)* %p, align 1
   store i64 %v, i64 addrspace(1)* %r, align 1
   ret void
@@ -286,7 +286,7 @@ define void @unaligned_load_store_i64_gl
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
 ; GCN: s_endpgm
-define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
   %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
   ret void
@@ -329,7 +329,7 @@ define void @local_unaligned_load_store_
 
 ; UNALIGNED: buffer_load_dwordx4
 ; UNALIGNED: buffer_store_dwordx4
-define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
   %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
   ret void
@@ -337,7 +337,7 @@ define void @global_unaligned_load_store
 
 ; FUNC-LABEL: {{^}}local_load_i64_align_4:
 ; GCN: ds_read2_b32
-define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %val = load i64, i64 addrspace(3)* %in, align 4
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
@@ -345,7 +345,7 @@ define void @local_load_i64_align_4(i64
 
 ; FUNC-LABEL: {{^}}local_load_i64_align_4_with_offset
 ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
-define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
   %val = load i64, i64 addrspace(3)* %ptr, align 4
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -356,7 +356,7 @@ define void @local_load_i64_align_4_with
 ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
 ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
 ; GCN: s_endpgm
-define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
   %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
   %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
@@ -375,7 +375,7 @@ define void @local_load_i64_align_4_with
 ; GCN: ds_read_u8
 ; GCN: ds_read_u8
 ; GCN: store_dwordx2
-define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %val = load i64, i64 addrspace(3)* %in, align 1
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
@@ -383,7 +383,7 @@ define void @local_load_i64_align_1(i64
 
 ; FUNC-LABEL: {{^}}local_store_i64_align_4:
 ; GCN: ds_write2_b32
-define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
+define amdgpu_kernel void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
   store i64 %val, i64 addrspace(3)* %out, align 4
   ret void
 }
@@ -391,7 +391,7 @@ define void @local_store_i64_align_4(i64
 ; FUNC-LABEL: {{^}}local_store_i64_align_4_with_offset
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
 ; GCN: s_endpgm
-define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
   %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
   store i64 0, i64 addrspace(3)* %ptr, align 4
   ret void
@@ -401,7 +401,7 @@ define void @local_store_i64_align_4_wit
 ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; GCN: s_endpgm
-define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
   %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
   %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
   %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
@@ -418,7 +418,7 @@ define void @local_store_i64_align_4_wit
 ; UNALIGNED: s_load_dword
 
 ; SI: buffer_store_dword
-define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
   %v = load i32, i32 addrspace(2)* %p, align 1
   store i32 %v, i32 addrspace(1)* %r, align 4
   ret void
@@ -430,7 +430,7 @@ define void @constant_unaligned_load_i32
 
 ; UNALIGNED: s_load_dword
 ; UNALIGNED: buffer_store_dword
-define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
   %v = load i32, i32 addrspace(2)* %p, align 2
   store i32 %v, i32 addrspace(1)* %r, align 4
   ret void
@@ -444,7 +444,7 @@ define void @constant_align2_load_i32(i3
 
 ; UNALIGNED: s_load_dwordx2
 ; UNALIGNED: buffer_store_dwordx2
-define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(2)* %p, align 2
   store i64 %v, i64 addrspace(1)* %r, align 4
   ret void
@@ -453,7 +453,7 @@ define void @constant_align2_load_i64(i6
 ; SI-LABEL: {{^}}constant_align4_load_i64:
 ; SI: s_load_dwordx2
 ; SI: buffer_store_dwordx2
-define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(2)* %p, align 4
   store i64 %v, i64 addrspace(1)* %r, align 4
   ret void
@@ -462,7 +462,7 @@ define void @constant_align4_load_i64(i6
 ; SI-LABEL: {{^}}constant_align4_load_v4i32:
 ; SI: s_load_dwordx4
 ; SI: buffer_store_dwordx4
-define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
   %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 4
   store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
   ret void
@@ -482,7 +482,7 @@ define void @constant_align4_load_v4i32(
 ; UNALIGNED: buffer_load_dwordx2
 
 ; SI: buffer_store_dwordx2
-define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 {
   %v = load <2 x i32>, <2 x i32> addrspace(2)* %p, align 1
   store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
   ret void
@@ -512,7 +512,7 @@ define void @constant_unaligned_load_v2i
 ; UNALIGNED: buffer_load_dwordx4
 
 ; SI: buffer_store_dwordx4
-define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
   %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
   ret void
@@ -521,7 +521,7 @@ define void @constant_unaligned_load_v4i
 ; SI-LABEL: {{^}}constant_align4_load_i8:
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_byte
-define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
   %v = load i8, i8 addrspace(2)* %p, align 4
   store i8 %v, i8 addrspace(1)* %r, align 4
   ret void
@@ -530,7 +530,7 @@ define void @constant_align4_load_i8(i8
 ; SI-LABEL: {{^}}constant_align2_load_i8:
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_byte
-define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
   %v = load i8, i8 addrspace(2)* %p, align 2
   store i8 %v, i8 addrspace(1)* %r, align 2
   ret void
@@ -541,7 +541,7 @@ define void @constant_align2_load_i8(i8
 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
   %gep0 = getelementptr i32, i32 addrspace(2)* %p, i64 1
   %v0 = load i32, i32 addrspace(2)* %p, align 4
   %v1 = load i32, i32 addrspace(2)* %gep0, align 4
@@ -571,7 +571,7 @@ define void @constant_align4_merge_load_
 ; SI: ds_read_u8
 
 ; SI: ScratchSize: 0{{$}}
-define void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1
   store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
   ret void
@@ -596,7 +596,7 @@ define void @local_load_align1_v16i8(<16
 ; SI: ds_write_b8
 
 ; SI: ScratchSize: 0{{$}}
-define void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
+define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
   store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 
 
 ; CHECK-LABEL: {{^}}func:
-define void @func() #0 {
+define amdgpu_kernel void @func() #0 {
 B0:
   br i1 undef, label %B1, label %B2
 
@@ -72,7 +72,7 @@ bb11:
 ; CHECK: v_mov_b32_e32 v[[OUTPUT_LO]], v6
 
 ; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}}
-define void @partially_undef_copy() #0 {
+define amdgpu_kernel void @partially_undef_copy() #0 {
   %tmp0 = call i32 asm sideeffect "v_mov_b32_e32 v5, 5", "={VGPR5}"()
   %tmp1 = call i32 asm sideeffect "v_mov_b32_e32 v6, 6", "={VGPR6}"()
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; SI hits an assertion at -O0, evergreen hits a not implemented unreachable.
 
 ; COMMON-LABEL: {{^}}branch_true:
-define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 true, label %for.end, label %for.body.lr.ph
 
@@ -42,7 +42,7 @@ for.end:
 ; SI: s_cbranch_vccnz
 ; SI: s_cbranch_scc1
 ; SI: s_endpgm
-define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 false, label %for.end, label %for.body.lr.ph
 
@@ -79,7 +79,7 @@ for.end:
 ; SI: s_cbranch_scc1
 ; SI: s_cbranch_scc1
 ; SI: s_endpgm
-define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 undef, label %for.end, label %for.body.lr.ph
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll Tue Mar 21 16:39:51 2017
@@ -12,7 +12,7 @@
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -40,7 +40,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = fcmp oeq float %cond, 0.0
   br i1 %cmp0, label %if, label %else
@@ -68,7 +68,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %else, label %if
@@ -96,7 +96,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = fcmp oeq float %cond, 0.0
   br i1 %cmp0, label %else, label %if
@@ -123,7 +123,7 @@ done:
 ; GCN: buffer_store_dword
 ; GCN: [[ENDIF_LABEL]]:
 ; GCN: s_endpgm
-define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
+define amdgpu_kernel void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
 entry:
   %a.0 = fadd float %a, 10.0
   %cond = bitcast float %a.0 to i32
@@ -148,7 +148,7 @@ endif:
 ; GCN: buffer_store_dword
 ; GCN: [[ENDIF_LABEL]]:
 ; GCN: s_endpgm
-define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
+define amdgpu_kernel void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
 entry:
   %a.0 = fadd float %a, 10.0
   %cond = bitcast float %a.0 to i32
@@ -176,7 +176,7 @@ endif:
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 ; GCN: s_endpgm
-define void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
+define amdgpu_kernel void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
 entry:
   %cmp = icmp eq i32 %a, 0
   br i1 %cmp, label %if.then, label %if.else
@@ -209,7 +209,7 @@ if.end:
 ; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
 ; GCN: buffer_store_dword [[THREE]]
 ; GCN: s_endpgm
-define void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
+define amdgpu_kernel void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
 entry:
   %cmp = icmp eq i32 %a, 0
   br i1 %cmp, label %if.then, label %if.else
@@ -233,7 +233,7 @@ if.end:
 ; GCN: buffer_store_dword
 ; GCN: [[LABEL]]:
 ; GCN: s_endpgm
-define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
 main_body:
   %0 = icmp sgt i32 %cond, 0
   %1 = sext i1 %0 to i32
@@ -258,7 +258,7 @@ ENDIF:
 ; GCN: {{^}}[[BODY]]:
 ; GCN: buffer_store
 ; GCN: s_endpgm
-define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %cmp0 = icmp sgt i32 %cond0, 0
@@ -284,7 +284,7 @@ bb9:
 ; SI: s_cmp_lg_u32 [[I]], 0
 ; SI: s_cbranch_scc1 [[LOOP_LABEL]]
 ; SI: s_endpgm
-define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
 entry:
   br label %loop
 
@@ -310,7 +310,7 @@ done:
 ; GCN: {{^}}[[IF_UNIFORM_LABEL]]:
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
-define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %d_cmp = icmp ult i32 %tid, 16
@@ -338,7 +338,7 @@ endif:
 ; GCN: s_xor_b64  [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
-define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %u_cmp = icmp eq i32 %cond, 0
   br i1 %u_cmp, label %if, label %endif
@@ -370,7 +370,7 @@ endif:
 ; GCN: [[IF_UNIFORM]]:
 ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
 ; GCN: buffer_store_dword [[TWO]]
-define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %d_cmp = icmp eq i32 %tid, 0
@@ -410,7 +410,7 @@ exit:
 
 ; GCN: BB[[FNNUM]]_3:
 ; GCN: s_endpgm
-define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp1 = icmp sgt i32 %cond, 0
@@ -445,7 +445,7 @@ bb9:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -477,7 +477,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp ne i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -505,7 +505,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp sgt i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -524,7 +524,7 @@ done:
 
 ; GCN-LABEL: {{^}}move_to_valu_i64_eq:
 ; GCN: v_cmp_eq_u64_e32
-define void @move_to_valu_i64_eq(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @move_to_valu_i64_eq(i32 addrspace(1)* %out) {
   %cond = load volatile i64, i64 addrspace(3)* undef
   %cmp0 = icmp eq i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -543,7 +543,7 @@ done:
 
 ; GCN-LABEL: {{^}}move_to_valu_i64_ne:
 ; GCN: v_cmp_ne_u64_e32
-define void @move_to_valu_i64_ne(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @move_to_valu_i64_ne(i32 addrspace(1)* %out) {
   %cond = load volatile i64, i64 addrspace(3)* undef
   %cmp0 = icmp ne i64 %cond, 0
   br i1 %cmp0, label %if, label %else

Modified: llvm/trunk/test/CodeGen/AMDGPU/uniform-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/uniform-crash.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/uniform-crash.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/uniform-crash.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 ; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]]
 ; GCN: [[LABEL]]:
 ; GCN-NEXT: s_endpgm
-define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
 main_body:
   %0 = icmp sgt i32 %cond, 0
   %1 = sext i1 %0 to i32
@@ -25,7 +25,7 @@ ENDIF:
 ; GCN: {{^}}[[LOOP:[A-Z0-9_]+]]:
 ; GCN: s_cbranch_scc1 [[LOOP]]
 ; GCN: {{^}}[[BB0]]:
-define void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1)  {
+define amdgpu_kernel void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1)  {
 bb:
   %cnd = trunc i32 %arg to i1
   br i1 %cnd, label %bb2, label %bb5

Modified: llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll Tue Mar 21 16:39:51 2017
@@ -38,7 +38,7 @@ out:
 ; CHECK-NEXT: s_xor_b64
 ; CHECK-NEXT: ; mask branch
 ; CHECK-NEXT: s_cbranch_execz
-define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 main_body:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %cc = icmp eq i32 %tid, 0

Modified: llvm/trunk/test/CodeGen/AMDGPU/unknown-processor.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/unknown-processor.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/unknown-processor.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/unknown-processor.ll Tue Mar 21 16:39:51 2017
@@ -13,7 +13,7 @@
 ; GCN: ScratchSize: 8{{$}}
 
 ; R600: MOV
-define void @foo() {
+define amdgpu_kernel void @foo() {
   %alloca = alloca i32, align 4
   store volatile i32 0, i32* %alloca
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/unroll.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/unroll.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/unroll.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/unroll.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; CHECK-LABEL: @test
 ; CHECK-NOT: alloca
 ; CHECK: store i32 5, i32 addrspace(1)* %out
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
 entry:
   %0 = alloca [32 x i32]
   br label %loop.header

Modified: llvm/trunk/test/CodeGen/AMDGPU/unsupported-cc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/unsupported-cc.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/unsupported-cc.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/unsupported-cc.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 5(7.006492e-45)
-define void @slt(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @slt(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp slt i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
@@ -18,7 +18,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 5(7.006492e-45)
-define void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp ult i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
@@ -31,7 +31,7 @@ entry:
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
 ; CHECK-NEXT: LSHR *
-define void @ult_float(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ult_float(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ult float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
@@ -43,7 +43,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @ult_float_native(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ult_float_native(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ult float %in, 5.0
   %1 = select i1 %0, float 0.0, float 1.0
@@ -55,7 +55,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @olt(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @olt(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
@@ -67,7 +67,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 6(8.407791e-45)
-define void @sle(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @sle(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sle i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
@@ -79,7 +79,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 6(8.407791e-45)
-define void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp ule i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
@@ -92,7 +92,7 @@ entry:
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
 ; CHECK-NEXT: LSHR *
-define void @ule_float(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ule_float(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ule float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
@@ -104,7 +104,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @ule_float_native(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ule_float_native(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ule float %in, 5.0
   %1 = select i1 %0, float 0.0, float 1.0
@@ -116,7 +116,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT:1084227584(5.000000e+00)
-define void @ole(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ole(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ole float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0

Modified: llvm/trunk/test/CodeGen/AMDGPU/urem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/urem.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/urem.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/urem.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; FUNC-LABEL: {{^}}test_urem_i32:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -26,7 +26,7 @@ define void @test_urem_i32(i32 addrspace
 ; SI: v_sub_i32
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32, i32 addrspace(1) * %in
   %result = urem i32 %num, 7
   store i32 %result, i32 addrspace(1)* %out
@@ -36,7 +36,7 @@ define void @test_urem_i32_7(i32 addrspa
 ; FUNC-LABEL: {{^}}test_urem_v2i32:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -48,7 +48,7 @@ define void @test_urem_v2i32(<2 x i32> a
 ; FUNC-LABEL: {{^}}test_urem_v4i32:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -60,7 +60,7 @@ define void @test_urem_v4i32(<4 x i32> a
 ; FUNC-LABEL: {{^}}test_urem_i64:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
   %a = load i64, i64 addrspace(1)* %in
   %b = load i64, i64 addrspace(1)* %b_ptr
@@ -72,7 +72,7 @@ define void @test_urem_i64(i64 addrspace
 ; FUNC-LABEL: {{^}}test_urem_v2i64:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
   %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
   %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -84,7 +84,7 @@ define void @test_urem_v2i64(<2 x i64> a
 ; FUNC-LABEL: {{^}}test_urem_v4i64:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
   %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
   %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr

Modified: llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@ declare float @llvm.amdgcn.div.fixup.f32
 ; GCN: s_load_dword [[SGPR:s[0-9]+]],
 ; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
   %dbl = fadd float %a, %a
   store float %dbl, float addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@ define void @test_sgpr_use_twice_binop(f
 ; GCN: s_load_dword [[SGPR:s[0-9]+]],
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -35,7 +35,7 @@ define void @test_sgpr_use_three_ternary
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -58,7 +58,7 @@ define void @test_sgpr_use_twice_ternary
 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
-define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
   %va0 = load volatile float, float addrspace(1)* %in
   %va1 = load volatile float, float addrspace(1)* %in
   %fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1
@@ -76,7 +76,7 @@ define void @test_use_s_v_s(float addrsp
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -90,7 +90,7 @@ define void @test_sgpr_use_twice_ternary
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -100,7 +100,7 @@ define void @test_sgpr_use_twice_ternary
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -110,7 +110,7 @@ define void @test_sgpr_use_twice_ternary
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -121,7 +121,7 @@ define void @test_sgpr_use_twice_ternary
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 {
   %val = call float @llvm.amdgcn.div.fixup.f32(float 2.0, float %a, float %a) #1
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -132,7 +132,7 @@ define void @test_sgpr_use_twice_ternary
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[VK]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float 1024.0) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -143,7 +143,7 @@ define void @test_sgpr_use_twice_ternary
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT0]]
-define void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
   store float %fma, float addrspace(1)* %out
   ret void
@@ -158,7 +158,7 @@ define void @test_literal_use_twice_tern
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
-define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
   %fma1 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %b) #1
   store volatile float %fma0, float addrspace(1)* %out
@@ -171,7 +171,7 @@ define void @test_literal_use_twice_tern
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
   store float %fma, float addrspace(1)* %out
   ret void
@@ -186,7 +186,7 @@ define void @test_literal_use_twice_tern
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
-define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
   %fma1 = call float @llvm.fma.f32(float 1024.0, float %b, float 1024.0) #1
   store volatile float %fma0, float addrspace(1)* %out
@@ -199,7 +199,7 @@ define void @test_literal_use_twice_tern
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
   store float %fma, float addrspace(1)* %out
   ret void
@@ -214,7 +214,7 @@ define void @test_literal_use_twice_tern
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
-define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
   %fma1 = call float @llvm.fma.f32(float %b, float 1024.0, float 1024.0) #1
   store volatile float %fma0, float addrspace(1)* %out
@@ -234,7 +234,7 @@ define void @test_literal_use_twice_tern
 
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
-define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float %a, float %b, float 1024.0) #1
   %fma1 = call float @llvm.fma.f32(float %a, float %b, float 4096.0) #1
   store volatile float %fma0, float addrspace(1)* %out
@@ -259,7 +259,7 @@ define void @test_s0_s1_k_f32(float addr
 
 ; GCN: buffer_store_dwordx2 [[RESULT0]]
 ; GCN: buffer_store_dwordx2 [[RESULT1]]
-define void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 {
   %fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1
   %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1
   store volatile double %fma0, double addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/usubo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/usubo.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/usubo.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/usubo.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 
 ; EG: SUBB_UINT
 ; EG: ADDC_UINT
-define void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
@@ -27,7 +27,7 @@ define void @s_usubo_i64_zext(i64 addrsp
 
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
@@ -42,7 +42,7 @@ define void @s_usubo_i32(i32 addrspace(1
 
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
@@ -63,7 +63,7 @@ define void @v_usubo_i32(i32 addrspace(1
 
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-define void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
@@ -87,7 +87,7 @@ define void @v_usubo_i32_novcc(i32 addrs
 ; EG-DAG: SUB_INT
 ; EG-DAG: SUB_INT
 ; EG: SUB_INT
-define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
@@ -104,7 +104,7 @@ define void @s_usubo_i64(i64 addrspace(1
 ; EG-DAG: SUB_INT
 ; EG-DAG: SUB_INT
 ; EG: SUB_INT
-define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr
@@ -122,7 +122,7 @@ define void @v_usubo_i64(i64 addrspace(1
 ; FUNC-LABEL: {{^}}v_usubo_i16:
 ; VI: v_subrev_u16_e32
 ; VI: v_cmp_gt_u16_e32
-define void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr

Modified: llvm/trunk/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll Tue Mar 21 16:39:51 2017
@@ -1,14 +1,14 @@
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}kernel_arg_i64:
-define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
   store i64 %a, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; i64 arg works, v1i64 arg does not.
 ; CHECK-LABEL: {{^}}kernel_arg_v1i64:
-define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/v_cndmask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/v_cndmask.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/v_cndmask.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/v_cndmask.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; GCN-DAG: v{{[0-9]}}
 ; All nan values are converted to 0xffffffff
 ; GCN: s_endpgm
-define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
+define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
   %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
   %f = load float, float addrspace(1)* %f.gep
@@ -30,7 +30,7 @@ define void @v_cnd_nan_nosgpr(float addr
 ; GCN-DAG: v{{[0-9]}}
 ; All nan values are converted to 0xffffffff
 ; GCN: s_endpgm
-define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
+define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
   %setcc = icmp ne i32 %c, 0
   %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
   store float %select, float addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @v_cnd_nan(float addrspace(1
 ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -62,7 +62,7 @@ define void @fcmp_sgprX_k0_select_k1_sgp
 ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -77,7 +77,7 @@ define void @fcmp_sgprX_k0_select_k1_sgp
 ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -92,7 +92,7 @@ define void @fcmp_sgprX_k0_select_k0_sgp
 ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -107,7 +107,7 @@ define void @fcmp_sgprX_k0_select_k0_sgp
 ; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
 ; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]]
-define void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
@@ -124,7 +124,7 @@ define void @fcmp_sgprX_k0_select_k0_vgp
 ; GCN-DAG: s_load_dword [[X:s[0-9]+]]
 ; GCN: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
-define void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
@@ -142,7 +142,7 @@ define void @fcmp_sgprX_k0_select_k1_vgp
 ; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]]
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -159,7 +159,7 @@ define void @fcmp_vgprX_k0_select_k1_sgp
 ; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
 ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
-define void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -178,7 +178,7 @@ define void @fcmp_vgprX_k0_select_k1_vgp
 ; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
 ; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc
-define void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -203,7 +203,7 @@ define void @icmp_vgprX_k0_select_k1_vgp
 ; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
 ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s
 ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s
-define void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -226,7 +226,7 @@ define void @icmp_vgprX_k0_select_k1_vgp
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -249,7 +249,7 @@ define void @fcmp_vgprX_k0_select_vgprZ_
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -275,7 +275,7 @@ define void @fcmp_vgprX_k0_select_k1_vgp
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -298,7 +298,7 @@ define void @fcmp_k0_vgprX_select_k1_vgp
 ; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
 ; GCN: store_byte
-define void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -321,7 +321,7 @@ define void @icmp_vgprX_k0_select_k1_vgp
 ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -343,7 +343,7 @@ define void @fcmp_vgprX_k0_selectf64_k1_
 ; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -364,7 +364,7 @@ define void @fcmp_vgprX_k0_selecti64_k1_
 
 ; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc
-define void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -386,7 +386,7 @@ define void @icmp_vgprX_k0_selectf32_k1_
 ; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]]
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc
-define void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext

Modified: llvm/trunk/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.cvt.pk.u8.f32(f
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_0:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 0, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @v_cvt_pk_u8_f32_idx_0(i32 a
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_1:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@ define void @v_cvt_pk_u8_f32_idx_1(i32 a
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_2:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -29,7 +29,7 @@ define void @v_cvt_pk_u8_f32_idx_2(i32 a
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_3:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 3, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -40,7 +40,7 @@ define void @v_cvt_pk_u8_f32_idx_3(i32 a
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result0 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0
   %result1 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %result0) #0
   %result2 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %result1) #0
@@ -51,7 +51,7 @@ define void @v_cvt_pk_u8_f32_combine(i32
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 %idx, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/v_mac.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/v_mac.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/v_mac.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/v_mac.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@
 ; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
 ; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]]
 ; GCN: buffer_store_dword [[C]]
-define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -26,7 +26,7 @@ entry:
 ; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5
-define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
 entry:
   %tmp0 = fmul float 0.5, %in
   %tmp1 = fadd float %tmp0, 0.5
@@ -37,7 +37,7 @@ entry:
 ; GCN-LABEL: {{^}}mad_vvs:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
+define amdgpu_kernel void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
 
@@ -52,7 +52,7 @@ entry:
 
 ; GCN-LABEL: {{^}}mac_ssv:
 ; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
+define amdgpu_kernel void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
 entry:
   %c = load float, float addrspace(1)* %in
 
@@ -65,7 +65,7 @@ entry:
 ; GCN-LABEL: {{^}}mac_mad_same_add:
 ; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
 ; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
-define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -96,7 +96,7 @@ entry:
 ; GCN-LABEL: {{^}}mad_neg_src0:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -116,7 +116,7 @@ entry:
 ; GCN-LABEL: {{^}}nsz_mad_sub0_src0:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -136,7 +136,7 @@ entry:
 ; GCN-LABEL: {{^}}safe_mad_sub0_src0:
 ; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
 ; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]]
-define void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -156,7 +156,7 @@ entry:
 ; GCN-LABEL: {{^}}mad_neg_src1:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -176,7 +176,7 @@ entry:
 ; GCN-LABEL: {{^}}nsz_mad_sub0_src1:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -196,7 +196,7 @@ entry:
 ; GCN-LABEL: {{^}}mad_neg_src2:
 ; GCN-NOT: v_mac
 ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
-define void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -222,7 +222,7 @@ entry:
 
 ; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
 ; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
-define void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
+define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -257,7 +257,7 @@ bb:
 
 ; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
 ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
-define void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
+define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64

Modified: llvm/trunk/test/CodeGen/AMDGPU/v_mac_f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/v_mac_f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/v_mac_f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/v_mac_f16.ll Tue Mar 21 16:39:51 2017
@@ -14,7 +14,7 @@
 ; VI:  v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]]
 ; VI:  buffer_store_short v[[C_F16]]
 ; GCN: s_endpgm
-define void @mac_f16(
+define amdgpu_kernel void @mac_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -38,7 +38,7 @@ entry:
 ; VI:  v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
 ; VI:  v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_f16_same_add(
+define amdgpu_kernel void @mac_f16_same_add(
     half addrspace(1)* %r0,
     half addrspace(1)* %r1,
     half addrspace(1)* %a,
@@ -73,7 +73,7 @@ entry:
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_a(
+define amdgpu_kernel void @mac_f16_neg_a(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -100,7 +100,7 @@ entry:
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_b(
+define amdgpu_kernel void @mac_f16_neg_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -127,7 +127,7 @@ entry:
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_c(
+define amdgpu_kernel void @mac_f16_neg_c(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -151,7 +151,7 @@ entry:
 ; VI:  v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
 ; GCN: s_endpgm
-define void @mac_f16_neg_a_safe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -175,7 +175,7 @@ entry:
 ; VI:  v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_f16_neg_b_safe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -199,7 +199,7 @@ entry:
 ; VI:  v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_f16_neg_c_safe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -226,7 +226,7 @@ entry:
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_a_nsz_fp_math(
+define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -253,7 +253,7 @@ entry:
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_b_nsz_fp_math(
+define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -280,7 +280,7 @@ entry:
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_c_nsz_fp_math(
+define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -328,7 +328,7 @@ entry:
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
 ; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @mac_v2f16(
+define amdgpu_kernel void @mac_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -355,7 +355,7 @@ entry:
 ; VI:  v_mac_f16_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_v2f16_same_add(
+define amdgpu_kernel void @mac_v2f16_same_add(
     <2 x half> addrspace(1)* %r0,
     <2 x half> addrspace(1)* %r1,
     <2 x half> addrspace(1)* %a,
@@ -392,7 +392,7 @@ entry:
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_a(
+define amdgpu_kernel void @mac_v2f16_neg_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -421,7 +421,7 @@ entry:
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_b(
+define amdgpu_kernel void @mac_v2f16_neg_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -454,7 +454,7 @@ entry:
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_c(
+define amdgpu_kernel void @mac_v2f16_neg_c(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -482,7 +482,7 @@ entry:
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
 ; GCN: s_endpgm
-define void @mac_v2f16_neg_a_safe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -510,7 +510,7 @@ entry:
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_v2f16_neg_b_safe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -538,7 +538,7 @@ entry:
 ; VI:  v_mac_f16_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_v2f16_neg_c_safe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -571,7 +571,7 @@ entry:
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_a_nsz_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -604,7 +604,7 @@ entry:
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_b_nsz_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -637,7 +637,7 @@ entry:
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_c_nsz_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,

Modified: llvm/trunk/test/CodeGen/AMDGPU/v_madak_f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/v_madak_f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/v_madak_f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/v_madak_f16.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 ; VI:  v_madak_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}}
 ; VI:  buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @madak_f16(
+define amdgpu_kernel void @madak_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -28,7 +28,7 @@ entry:
 ; VI:  v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @madak_f16_use_2(
+define amdgpu_kernel void @madak_f16_use_2(
     half addrspace(1)* %r0,
     half addrspace(1)* %r1,
     half addrspace(1)* %a,

Modified: llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll Tue Mar 21 16:39:51 2017
@@ -29,7 +29,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
 ; SI-NEXT: ; mask branch
 ;
-define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
+define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   switch i32 %tid, label %default [
@@ -76,7 +76,7 @@ end:
 ; SI: BB1_2:
 ; SI: s_or_b64 exec, exec, [[BR_SREG]]
 ; SI: s_endpgm
-define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %is.0 = icmp ne i32 %tid, 0
   br i1 %is.0, label %store, label %exit
@@ -106,7 +106,7 @@ exit:
 ; SI: [[LABEL_EXIT]]:
 ; SI: s_endpgm
 
-define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %is.0 = icmp ne i32 %tid, 0
@@ -173,7 +173,7 @@ exit:
 ; SI-NOT: [[COND_STATE]]
 ; SI: s_endpgm
 
-define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
+define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp4 = sext i32 %tmp to i64

Modified: llvm/trunk/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir Tue Mar 21 16:39:51 2017
@@ -1,7 +1,7 @@
 # RUN: llc -run-pass si-insert-waits -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s
 --- |
 
-  define void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 {
+  define amdgpu_kernel void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 {
   entry:
     %cmp0 = fcmp oeq float %cond, 0.000000e+00
     br i1 %cmp0, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
@@ -20,7 +20,7 @@
     ret void
   }
 
-  define void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+  define amdgpu_kernel void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
   entry:
     br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/vector-alloca.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vector-alloca.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vector-alloca.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vector-alloca.ll Tue Mar 21 16:39:51 2017
@@ -15,7 +15,7 @@
 ; EG: MOV
 ; EG: MOV
 ; EG: MOVA_INT
-define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [4 x i32]
   %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -44,7 +44,7 @@ entry:
 ; EG: MOV
 ; EG: MOVA_INT
 ; EG: MOVA_INT
-define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 entry:
   %tmp = alloca [4 x i32]
   %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -71,7 +71,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}bitcast_gep:
 ; EG: STORE_RAW
-define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 entry:
   %tmp = alloca [4 x i32]
   %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -93,7 +93,7 @@ entry:
 ; OPT-LABEL: @vector_read_bitcast_gep(
 ; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
 ; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
-define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [4 x i32]
   %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -121,7 +121,7 @@ entry:
 ; OPT: store float
 ; OPT: store float
 ; OPT: load float
-define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [4 x i32]
   %tmp.bc = bitcast [4 x i32]* %tmp to [4 x float]*

Modified: llvm/trunk/test/CodeGen/AMDGPU/vector-extract-insert.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vector-extract-insert.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vector-extract-insert.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vector-extract-insert.ll Tue Mar 21 16:39:51 2017
@@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
   %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
@@ -30,7 +30,7 @@ define void @extract_insert_same_dynelt_
 ; GCN: v_movreld_b32
 ; GCN: v_movrels_b32
 ; GCN: buffer_store_dword v
-define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
+define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
   %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
@@ -49,7 +49,7 @@ define void @extract_insert_different_dy
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
   %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
@@ -68,7 +68,7 @@ define void @extract_insert_same_elt2_v4
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
   %gep.in = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %id.ext

Modified: llvm/trunk/test/CodeGen/AMDGPU/vectorize-global-local.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vectorize-global-local.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vectorize-global-local.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vectorize-global-local.ll Tue Mar 21 16:39:51 2017
@@ -12,7 +12,7 @@
 ; CHECK-DAG: ds_write2_b32
 ; CHECK-DAG: ds_write2_b32
 
-define void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) {
+define amdgpu_kernel void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) {
 bb:
   %tmp = load i32, i32 addrspace(1)* %arg, align 4
   store i32 %tmp, i32 addrspace(3)* %arg1, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
 
-define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %v = load i32, i32 addrspace(1)* %in
   store i32 %v, i32 addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@ define void @vtx_fetch32(i32 addrspace(1
 ; EG: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00
 ; CM: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x00,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x00,0x00
 
-define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %v = load <4 x i32>, <4 x i32> addrspace(1)* %in
   store <4 x i32> %v, <4 x i32> addrspace(1)* %out
   ret void
@@ -26,7 +26,7 @@ define void @vtx_fetch128(<4 x i32> addr
 ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
 
-define void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) {
+define amdgpu_kernel void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) {
   %v = load i32, i32 addrspace(7)* %in
   store i32 %v, i32 addrspace(1)* %out
   ret void
@@ -38,7 +38,7 @@ define void @vtx_fetch32_id3(i32 addrspa
 
 @t = internal addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3]
 
-define void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) {
   %a = getelementptr inbounds [4 x i32], [4 x i32] addrspace(2)* @t, i32 0, i32 %in
   %v = load i32, i32 addrspace(2)* %a
   store i32 %v, i32 addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll Tue Mar 21 16:39:51 2017
@@ -45,7 +45,7 @@
 ; GCN: ScratchSize: 1536
 
 ; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset.
-define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
+define amdgpu_kernel void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
 bb:
   %tmp = add i32 %arg1, %arg2
   %tmp7 = extractelement <4 x float> %arg6, i32 0

Modified: llvm/trunk/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 
 declare float @llvm.amdgcn.rsq.legacy(float) #0
 
-define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
   %rsq = call float @llvm.amdgcn.rsq.legacy(float %src), !dbg !4
   store float %rsq, float addrspace(1)* %out, align 4
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/vop-shrink.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vop-shrink.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vop-shrink.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vop-shrink.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@
 
 ; ModuleID = 'vop-shrink.ll'
 
-define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
+define amdgpu_kernel void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
 entry:
   %vgpr = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tmp = icmp eq i32 %cond, 0
@@ -35,7 +35,7 @@ endif:
 
 ; FUNC-LABEL: {{^}}add_fold:
 ; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000
-define void @add_fold(float addrspace(1)* %out) {
+define amdgpu_kernel void @add_fold(float addrspace(1)* %out) {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = uitofp i32 %tmp to float

Modified: llvm/trunk/test/CodeGen/AMDGPU/vselect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vselect.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vselect.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vselect.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@
 ; SI: v_cndmask_b32_e64
 ; SI: v_cndmask_b32_e32
 
-define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
+define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
 entry:
   %load0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
   %load1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
@@ -28,7 +28,7 @@ entry:
 ;SI: v_cndmask_b32_e64
 ;SI: v_cndmask_b32_e32
 
-define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
+define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
 entry:
   %0 = load <2 x float>, <2 x float> addrspace(1)* %in0
   %1 = load <2 x float>, <2 x float> addrspace(1)* %in1
@@ -52,7 +52,7 @@ entry:
 ; SI: v_cndmask_b32
 ; SI: v_cndmask_b32
 
-define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
+define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
 entry:
   %load0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
   %load1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
@@ -68,7 +68,7 @@ entry:
 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
+define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
 entry:
   %0 = load <4 x float>, <4 x float> addrspace(1)* %in0
   %1 = load <4 x float>, <4 x float> addrspace(1)* %in1

Modified: llvm/trunk/test/CodeGen/AMDGPU/vselect64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vselect64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vselect64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vselect64.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; Make sure the vectors aren't being stored on the stack.  We know they are
 ; being stored on the stack if the shader uses at least 10 registers.
 ; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
-define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
+define amdgpu_kernel void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
 entry:
        %cmp = icmp ne  <4 x i32> %c, <i32 0, i32 0, i32 0, i32 0>
        %result = select <4 x i1> %cmp, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> <i64 4, i64 5, i64 6, i64 7>

Modified: llvm/trunk/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vtx-fetch-branch.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vtx-fetch-branch.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vtx-fetch-branch.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@
 ; CHECK-NOT: ALU_POP_AFTER
 ; CHECK: TEX
 ; CHECK-NEXT: POP
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
 entry:
   %0 = icmp eq i32 %cond, 0
   br i1 %0, label %endif, label %if

Modified: llvm/trunk/test/CodeGen/AMDGPU/vtx-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vtx-schedule.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vtx-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vtx-schedule.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0
 ; CHECK: Fetch clause
 ; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0
-define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
+define amdgpu_kernel void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
 entry:
   %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0
   %1 = load i32, i32 addrspace(1)* %0

Modified: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-flat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/waitcnt-flat.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-flat.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-flat.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
 ; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
-define void @test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) {
   store volatile i32 0, i32 addrspace(1)* %out
   %val = load volatile i32, i32 addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/waitcnt.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/waitcnt.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt.mir Tue Mar 21 16:39:51 2017
@@ -1,18 +1,18 @@
 # RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-waits  %s -o - | FileCheck %s
 
 --- |
-  define void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
+  define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
                                  <4 x i32> addrspace(1)* %global16,
                                  i32 addrspace(4)* %flat4,
                                  <4 x i32> addrspace(4)* %flat16) {
     ret void
   }
 
-  define void @single_fallthrough_successor_no_end_block_wait() {
+  define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() {
     ret void
   }
 
-  define void @single_branch_successor_not_next_block() {
+  define amdgpu_kernel void @single_branch_successor_not_next_block() {
     ret void
   }
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@ declare void @llvm.write_register.i32(me
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 
-define void @write_vgpr_into_sgpr() {
+define amdgpu_kernel void @write_vgpr_into_sgpr() {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.write_register.i32(metadata !0, i32 %tid)
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/write_register.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/write_register.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/write_register.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/write_register.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@ declare void @llvm.write_register.i32(me
 declare void @llvm.write_register.i64(metadata, i64) #0
 
 ; CHECK-LABEL: {{^}}test_write_m0:
-define void @test_write_m0(i32 %val) #0 {
+define amdgpu_kernel void @test_write_m0(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !0, i32 0)
   call void @llvm.write_register.i32(metadata !0, i32 -1)
   call void @llvm.write_register.i32(metadata !0, i32 %val)
@@ -15,7 +15,7 @@ define void @test_write_m0(i32 %val) #0
 ; CHECK: s_mov_b64 exec, 0
 ; CHECK: s_mov_b64 exec, -1
 ; CHECK: s_mov_b64 exec, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_write_exec(i64 %val) #0 {
+define amdgpu_kernel void @test_write_exec(i64 %val) #0 {
   call void @llvm.write_register.i64(metadata !1, i64 0)
   call void @llvm.write_register.i64(metadata !1, i64 -1)
   call void @llvm.write_register.i64(metadata !1, i64 %val)
@@ -26,7 +26,7 @@ define void @test_write_exec(i64 %val) #
 ; CHECK: s_mov_b64 flat_scratch, 0
 ; CHECK: s_mov_b64 flat_scratch, -1
 ; CHECK: s_mov_b64 flat_scratch, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_write_flat_scratch(i64 %val) #0 {
+define amdgpu_kernel void @test_write_flat_scratch(i64 %val) #0 {
   call void @llvm.write_register.i64(metadata !2, i64 0)
   call void @llvm.write_register.i64(metadata !2, i64 -1)
   call void @llvm.write_register.i64(metadata !2, i64 %val)
@@ -36,7 +36,7 @@ define void @test_write_flat_scratch(i64
 ; CHECK-LABEL: {{^}}test_write_flat_scratch_lo:
 ; CHECK: s_mov_b32 flat_scratch_lo, 0
 ; CHECK: s_mov_b32 flat_scratch_lo, s{{[0-9]+}}
-define void @test_write_flat_scratch_lo(i32 %val) #0 {
+define amdgpu_kernel void @test_write_flat_scratch_lo(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !3, i32 0)
   call void @llvm.write_register.i32(metadata !3, i32 %val)
   ret void
@@ -45,7 +45,7 @@ define void @test_write_flat_scratch_lo(
 ; CHECK-LABEL: {{^}}test_write_flat_scratch_hi:
 ; CHECK: s_mov_b32 flat_scratch_hi, 0
 ; CHECK: s_mov_b32 flat_scratch_hi, s{{[0-9]+}}
-define void @test_write_flat_scratch_hi(i32 %val) #0 {
+define amdgpu_kernel void @test_write_flat_scratch_hi(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !4, i32 0)
   call void @llvm.write_register.i32(metadata !4, i32 %val)
   ret void
@@ -54,7 +54,7 @@ define void @test_write_flat_scratch_hi(
 ; CHECK-LABEL: {{^}}test_write_exec_lo:
 ; CHECK: s_mov_b32 exec_lo, 0
 ; CHECK: s_mov_b32 exec_lo, s{{[0-9]+}}
-define void @test_write_exec_lo(i32 %val) #0 {
+define amdgpu_kernel void @test_write_exec_lo(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !5, i32 0)
   call void @llvm.write_register.i32(metadata !5, i32 %val)
   ret void
@@ -63,7 +63,7 @@ define void @test_write_exec_lo(i32 %val
 ; CHECK-LABEL: {{^}}test_write_exec_hi:
 ; CHECK: s_mov_b32 exec_hi, 0
 ; CHECK: s_mov_b32 exec_hi, s{{[0-9]+}}
-define void @test_write_exec_hi(i32 %val) #0 {
+define amdgpu_kernel void @test_write_exec_hi(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !6, i32 0)
   call void @llvm.write_register.i32(metadata !6, i32 %val)
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 ;CHECK: {{^}}fill3d:
 ;CHECK-NOT: MULLO_INT T[0-9]+
 
-define void @fill3d(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @fill3d(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %x.i = tail call i32 @llvm.r600.read.global.size.x() #1
   %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1

Modified: llvm/trunk/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 
 ; TODO: enable doubles
 ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
-define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
   %val = load double, double addrspace(1)* %in, align 8
   %add = fadd double %val, 4.0
   %bc = bitcast double %add to <2 x i32>
@@ -14,7 +14,7 @@ define void @bitcast_f64_to_v2i32(<2 x i
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
-define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
+define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
@@ -30,7 +30,7 @@ end:
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
-define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
+define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end

Modified: llvm/trunk/test/CodeGen/AMDGPU/xor.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/xor.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/xor.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/xor.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@
 ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
+define amdgpu_kernel void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1
   %result = xor <2 x i32> %a, %b
@@ -29,7 +29,7 @@ define void @xor_v2i32(<2 x i32> addrspa
 ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+define amdgpu_kernel void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1
   %result = xor <4 x i32> %a, %b
@@ -46,7 +46,7 @@ define void @xor_v4i32(<4 x i32> addrspa
 ; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
   %a = load float, float addrspace(1) * %in0
   %b = load float, float addrspace(1) * %in1
   %acmp = fcmp oge float %a, 0.000000e+00
@@ -63,7 +63,7 @@ define void @xor_i1(float addrspace(1)*
 ; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
+define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
   %a = load volatile i1, i1 addrspace(1)* %in0
   %b = load volatile i1, i1 addrspace(1)* %in1
   %xor = xor i1 %a, %b
@@ -73,7 +73,7 @@ define void @v_xor_i1(i1 addrspace(1)* %
 
 ; FUNC-LABEL: {{^}}vector_xor_i32:
 ; SI: v_xor_b32_e32
-define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %a = load i32, i32 addrspace(1)* %in0
   %b = load i32, i32 addrspace(1)* %in1
   %result = xor i32 %a, %b
@@ -83,7 +83,7 @@ define void @vector_xor_i32(i32 addrspac
 
 ; FUNC-LABEL: {{^}}scalar_xor_i32:
 ; SI: s_xor_b32
-define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %result = xor i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -91,7 +91,7 @@ define void @scalar_xor_i32(i32 addrspac
 
 ; FUNC-LABEL: {{^}}scalar_not_i32:
 ; SI: s_not_b32
-define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
   %result = xor i32 %a, -1
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -99,7 +99,7 @@ define void @scalar_not_i32(i32 addrspac
 
 ; FUNC-LABEL: {{^}}vector_not_i32:
 ; SI: v_not_b32
-define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %a = load i32, i32 addrspace(1)* %in0
   %b = load i32, i32 addrspace(1)* %in1
   %result = xor i32 %a, -1
@@ -111,7 +111,7 @@ define void @vector_not_i32(i32 addrspac
 ; SI: v_xor_b32_e32
 ; SI: v_xor_b32_e32
 ; SI: s_endpgm
-define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
   %a = load i64, i64 addrspace(1)* %in0
   %b = load i64, i64 addrspace(1)* %in1
   %result = xor i64 %a, %b
@@ -122,7 +122,7 @@ define void @vector_xor_i64(i64 addrspac
 ; FUNC-LABEL: {{^}}scalar_xor_i64:
 ; SI: s_xor_b64
 ; SI: s_endpgm
-define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = xor i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -130,7 +130,7 @@ define void @scalar_xor_i64(i64 addrspac
 
 ; FUNC-LABEL: {{^}}scalar_not_i64:
 ; SI: s_not_b64
-define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
   %result = xor i64 %a, -1
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -139,7 +139,7 @@ define void @scalar_not_i64(i64 addrspac
 ; FUNC-LABEL: {{^}}vector_not_i64:
 ; SI: v_not_b32
 ; SI: v_not_b32
-define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
   %a = load i64, i64 addrspace(1)* %in0
   %b = load i64, i64 addrspace(1)* %in1
   %result = xor i64 %a, -1
@@ -153,7 +153,7 @@ define void @vector_not_i64(i64 addrspac
 
 ; FUNC-LABEL: {{^}}xor_cf:
 ; SI: s_xor_b64
-define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
+define amdgpu_kernel void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
@@ -178,7 +178,7 @@ endif:
 ; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
-define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = xor i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -192,7 +192,7 @@ define void @scalar_xor_literal_i64(i64
 
 ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
 ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
-define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = xor i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
 
@@ -211,7 +211,7 @@ define void @scalar_xor_literal_multi_us
 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
 ; SI-NOT: xor_b32
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = xor i64 %a, 63
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -220,7 +220,7 @@ define void @scalar_xor_inline_imm_i64(i
 ; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; SI: s_xor_b64 [[VAL]], [[VAL]], -8
-define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = xor i64 %a, -8
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -231,7 +231,7 @@ define void @scalar_xor_neg_inline_imm_i
 ; SI: v_xor_b32_e32 {{v[0-9]+}}, -8, v[[LO_VREG]]
 ; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}}
 ; SI: s_endpgm
-define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = xor i64 %loada, -8
   store i64 %or, i64 addrspace(1)* %out
@@ -243,7 +243,7 @@ define void @vector_xor_i64_neg_inline_i
 ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
 ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = xor i64 %loada, 22470723082367
   store i64 %or, i64 addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/zero_extend.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/zero_extend.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/zero_extend.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/zero_extend.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; SI: {{^}}s_mad_zext_i32_to_i64:
 ; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
 ; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
-define void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
 entry:
   %tmp0 = mul i32 %a, %b
   %tmp1 = add i32 %tmp0, %c
@@ -20,7 +20,7 @@ entry:
 
 ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32
 ; SI: v_cndmask_b32
-define void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %tmp0 = icmp eq i32 %a, %b
   %tmp1 = zext i1 %tmp0 to i32
@@ -29,7 +29,7 @@ entry:
 }
 
 ; SI-LABEL: {{^}}s_arg_zext_i1_to_i64:
-define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
+define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
   %ext = zext i1 %arg to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
@@ -39,7 +39,7 @@ define void @s_arg_zext_i1_to_i64(i64 ad
 ; SI: s_mov_b32 s{{[0-9]+}}, 0
 ; SI: v_cmp_eq_u32
 ; SI: v_cndmask_b32
-define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp eq i32 %a, %b
   %ext = zext i1 %cmp to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
@@ -49,7 +49,7 @@ define void @s_cmp_zext_i1_to_i64(i64 ad
 ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; SI: buffer_store_short [[RESULT]]
-define void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
   %tmp0 = icmp eq i16 %a, %b
   %tmp1 = zext i1 %tmp0 to i16
   store i16 %tmp1, i16 addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@
 ; GCN-NOT: v[[HI]]
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %ld.64 = load volatile i64, i64 addrspace(1)* %in0
   %ld.32 = load volatile i32, i32 addrspace(1)* %in1
   %ext = zext i32 %ld.32 to i64
@@ -31,7 +31,7 @@ define void @zext_or_operand_i64(i64 add
 ; GCN-NOT: _or_
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %ld.64 = load volatile i64, i64 addrspace(1)* %in0
   %ld.32 = load volatile i32, i32 addrspace(1)* %in1
   %ext = zext i32 %ld.32 to i64

Modified: llvm/trunk/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir (original)
+++ llvm/trunk/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 
   @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4
 
-  define void @float(float addrspace(1)* %out, i32 %index) #0 {
+  define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 {
   entry:
     %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
     %1 = load float, float addrspace(2)* %0

Modified: llvm/trunk/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir (original)
+++ llvm/trunk/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir Tue Mar 21 16:39:51 2017
@@ -1,6 +1,6 @@
 # RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s
 --- |
-  define void @add_f32_1.0_one_f16_use() #0 {
+  define amdgpu_kernel void @add_f32_1.0_one_f16_use() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -11,7 +11,7 @@
     ret void
   }
 
-  define void @add_f32_1.0_multi_f16_use() #0 {
+  define amdgpu_kernel void @add_f32_1.0_multi_f16_use() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -22,7 +22,7 @@
     ret void
   }
 
-  define void @add_f32_1.0_one_f32_use_one_f16_use () #0 {
+  define amdgpu_kernel void @add_f32_1.0_one_f32_use_one_f16_use () #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -33,7 +33,7 @@
     ret void
   }
 
-  define void @add_f32_1.0_one_f32_use_multi_f16_use () #0 {
+  define amdgpu_kernel void @add_f32_1.0_one_f32_use_multi_f16_use () #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -46,7 +46,7 @@
     ret void
   }
 
-  define void @add_i32_1_multi_f16_use() #0 {
+  define amdgpu_kernel void @add_i32_1_multi_f16_use() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f16.add0 = fadd half %f16.val0, 0xH0001
@@ -56,7 +56,7 @@
     ret void
   }
 
-  define void @add_i32_m2_one_f32_use_multi_f16_use () #0 {
+  define amdgpu_kernel void @add_i32_m2_one_f32_use_multi_f16_use () #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -69,7 +69,7 @@
     ret void
   }
 
-  define void @add_f16_1.0_multi_f32_use() #0 {
+  define amdgpu_kernel void @add_f16_1.0_multi_f32_use() #0 {
     %f32.val0 = load volatile float, float addrspace(1)* undef
     %f32.val1 = load volatile float, float addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -80,7 +80,7 @@
     ret void
   }
 
-  define void @add_f16_1.0_other_high_bits_multi_f16_use() #0 {
+  define amdgpu_kernel void @add_f16_1.0_other_high_bits_multi_f16_use() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile half, half addrspace(1)* undef
@@ -91,7 +91,7 @@
     ret void
   }
 
-  define void @add_f16_1.0_other_high_bits_use_f16_f32() #0 {
+  define amdgpu_kernel void @add_f16_1.0_other_high_bits_use_f16_f32() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile half, half addrspace(1)* undef

Modified: llvm/trunk/test/CodeGen/MIR/AMDGPU/intrinsics.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/MIR/AMDGPU/intrinsics.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/MIR/AMDGPU/intrinsics.mir (original)
+++ llvm/trunk/test/CodeGen/MIR/AMDGPU/intrinsics.mir Tue Mar 21 16:39:51 2017
@@ -2,7 +2,7 @@
 
 --- |
 
-  define void @use_intrin() {
+  define amdgpu_kernel void @use_intrin() {
     ret void
   }
 

Modified: llvm/trunk/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir (original)
+++ llvm/trunk/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 
   @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4
 
-  define void @float(float addrspace(1)* %out, i32 %index) #0 {
+  define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 {
   entry:
     %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
     %1 = load float, float addrspace(2)* %0

Modified: llvm/trunk/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/MIR/AMDGPU/target-index-operands.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/MIR/AMDGPU/target-index-operands.mir (original)
+++ llvm/trunk/test/CodeGen/MIR/AMDGPU/target-index-operands.mir Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 
   @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4
 
-  define void @float(float addrspace(1)* %out, i32 %index) #0 {
+  define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 {
   entry:
     %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
     %1 = load float, float addrspace(2)* %0
@@ -15,7 +15,7 @@
     ret void
   }
 
-  define void @float2(float addrspace(1)* %out, i32 %index) #0 {
+  define amdgpu_kernel void @float2(float addrspace(1)* %out, i32 %index) #0 {
   entry:
     %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
     %1 = load float, float addrspace(2)* %0

Modified: llvm/trunk/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll (original)
+++ llvm/trunk/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; ASC-NOT: ptrtoint
 ; ASC-NOT: inttoptr
 
-define void @test_sink_ptrtoint_asc(float addrspace(1)* nocapture %arg, float addrspace(1)* nocapture readonly %arg1, float addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @test_sink_ptrtoint_asc(float addrspace(1)* nocapture %arg, float addrspace(1)* nocapture readonly %arg1, float addrspace(3)* %arg2) #0 {
 bb:
   %tmp = getelementptr inbounds float, float addrspace(3)* %arg2, i32 16
   %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x() #1

Modified: llvm/trunk/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll (original)
+++ llvm/trunk/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll Tue Mar 21 16:39:51 2017
@@ -14,7 +14,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; CHECK-LABEL: @indvar_32_bit(
 ; CHECK-NOT: sext i32
 ; CHECK: phi i32
-define void @indvar_32_bit(i32 %n, i32* nocapture %output) {
+define amdgpu_kernel void @indvar_32_bit(i32 %n, i32* nocapture %output) {
 entry:
   %cmp5 = icmp sgt i32 %n, 0
   br i1 %cmp5, label %for.body.preheader, label %for.end
@@ -46,7 +46,7 @@ for.end:
 ; CHECK-NOT: ashr i64
 ; CHECK-NOT: mul nsw i64
 ; CHECK-NOT: add nsw i64
-define void @no_promote_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @no_promote_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   br label %for.body
 
@@ -72,7 +72,7 @@ for.end:
 ; be legalized anyway.
 
 ; CHECK-LABEL: @indvar_48_bit(
-define void @indvar_48_bit(i48 %n, i48* nocapture %output) {
+define amdgpu_kernel void @indvar_48_bit(i48 %n, i48* nocapture %output) {
 entry:
   %cmp5 = icmp sgt i48 %n, 0
   br i1 %cmp5, label %for.body.preheader, label %for.end
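
For context, a standalone sketch of the property these IndVarSimplify tests encode: on AMDGPU, 64-bit integer arithmetic is no cheaper than 32-bit, so a 32-bit induction variable should not be widened to i64 (hence the CHECK-NOT: sext i32 and CHECK: phi i32 lines above). The function name below is hypothetical, and it assumes the test file's amdgcn target datalayout and the legacy opt -indvars pass name.

  define amdgpu_kernel void @example_i32_indvar(i32 %n, i32 addrspace(1)* nocapture %out) {
  entry:
    br label %for.body

  for.body:
    ; Expected to remain an i32 phi after -indvars; no sext/zext to i64.
    %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
    %gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %iv
    store i32 %iv, i32 addrspace(1)* %gep, align 4
    %iv.next = add nuw nsw i32 %iv, 1
    %exitcond = icmp slt i32 %iv.next, %n
    br i1 %exitcond, label %for.body, label %for.end

  for.end:
    ret void
  }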

Modified: llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll (original)
+++ llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll Tue Mar 21 16:39:51 2017
@@ -45,7 +45,7 @@ define float @load_private_from_flat(flo
 ; CHECK-LABEL: @store_global_from_flat(
 ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
 ; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* %tmp0
-define void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 {
+define amdgpu_kernel void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 {
   %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
   store float 0.0, float addrspace(1)* %tmp0
   ret void
@@ -54,7 +54,7 @@ define void @store_global_from_flat(floa
 ; CHECK-LABEL: @store_group_from_flat(
 ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
 ; CHECK-NEXT: store float 0.000000e+00, float addrspace(3)* %tmp0
-define void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 {
+define amdgpu_kernel void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 {
   %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
   store float 0.0, float addrspace(3)* %tmp0
   ret void
@@ -63,7 +63,7 @@ define void @store_group_from_flat(float
 ; CHECK-LABEL: @store_private_from_flat(
 ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
 ; CHECK-NEXT: store float 0.000000e+00, float* %tmp0
-define void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 {
+define amdgpu_kernel void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 {
   %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
   store float 0.0, float* %tmp0
   ret void
@@ -74,7 +74,7 @@ define void @store_private_from_flat(flo
 ; CHECK-NEXT: %val = load i32, i32 addrspace(1)* %input, align 4
 ; CHECK-NEXT: store i32 %val, i32 addrspace(1)* %output, align 4
 ; CHECK-NEXT: ret void
-define void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -87,7 +87,7 @@ define void @load_store_global(i32 addrs
 ; CHECK-NEXT: %val = load i32, i32 addrspace(3)* %input, align 4
 ; CHECK-NEXT: store i32 %val, i32 addrspace(3)* %output, align 4
 ; CHECK-NEXT: ret void
-define void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
+define amdgpu_kernel void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -100,7 +100,7 @@ define void @load_store_group(i32 addrsp
 ; CHECK-NEXT: %val = load i32, i32* %input, align 4
 ; CHECK-NEXT: store i32 %val, i32* %output, align 4
 ; CHECK-NEXT: ret void
-define void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 {
+define amdgpu_kernel void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 {
   %tmp0 = addrspacecast i32* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -113,7 +113,7 @@ define void @load_store_private(i32* noc
 ; CHECK-NEXT: %val = load i32, i32 addrspace(4)* %input, align 4
 ; CHECK-NEXT: store i32 %val, i32 addrspace(4)* %output, align 4
 ; CHECK-NEXT: ret void
-define void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 {
+define amdgpu_kernel void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 {
   %val = load i32, i32 addrspace(4)* %input, align 4
   store i32 %val, i32 addrspace(4)* %output, align 4
   ret void
@@ -122,7 +122,7 @@ define void @load_store_flat(i32 addrspa
 ; CHECK-LABEL: @store_addrspacecast_ptr_value(
 ; CHECK: %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
 ; CHECK-NEXT: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4
-define void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 {
   %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
   store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4
   ret void
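
For context, a minimal standalone input in the style of these InferAddressSpaces tests (hypothetical function name; assumes the legacy opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces invocation and the 2017-era convention that addrspace(4) is the flat address space).

  define amdgpu_kernel void @example_load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) {
    %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
    %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
    ; Expected (per the CHECK-NEXT lines above): the casts are folded away and
    ; the memory operations use addrspace(1) directly, i.e.
    ;   %val = load i32, i32 addrspace(1)* %input, align 4
    ;   store i32 %val, i32 addrspace(1)* %output, align 4
    %val = load i32, i32 addrspace(4)* %tmp0, align 4
    store i32 %val, i32 addrspace(4)* %tmp1, align 4
    ret void
  }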

Modified: llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll (original)
+++ llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll Tue Mar 21 16:39:51 2017
@@ -28,7 +28,7 @@
 ; CHECK: store float %v, float addrspace(3)* %tmp7, align 4
 ; CHECK: call void @llvm.amdgcn.s.barrier()
 ; CHECK: ret void
-define void @load_store_lds_f32(i32 %i, float %v) #0 {
+define amdgpu_kernel void @load_store_lds_f32(i32 %i, float %v) #0 {
 bb:
   %tmp = load float, float addrspace(4)* addrspacecast (float addrspace(3)* @scalar to float addrspace(4)*), align 4
   call void @use(float %tmp)
@@ -83,7 +83,7 @@ bb:
 
 ; CHECK-LABEL: @nested_const_expr(
 ; CHECK: store i32 1, i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds ([10 x float], [10 x float] addrspace(3)* @array, i64 0, i64 1) to i32 addrspace(3)*), align 4
-define void @nested_const_expr() #0 {
+define amdgpu_kernel void @nested_const_expr() #0 {
   store i32 1, i32 addrspace(4)* bitcast (float addrspace(4)* getelementptr ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i64 0, i64 1) to i32 addrspace(4)*), align 4
   ret void
 }
@@ -93,7 +93,7 @@ define void @nested_const_expr() #0 {
 ; CHECK-NEXT: %v = load float, float addrspace(1)* %addr
 ; CHECK-NEXT: store float %v, float addrspace(1)* %addr
 ; CHECK-NEXT: ret void
-define void @rauw(float addrspace(1)* %input) #0 {
+define amdgpu_kernel void @rauw(float addrspace(1)* %input) #0 {
 bb:
   %generic_input = addrspacecast float addrspace(1)* %input to float addrspace(4)*
   %addr = getelementptr float, float addrspace(4)* %generic_input, i64 10
@@ -117,7 +117,7 @@ bb:
 ; CHECK: %exit_cond = icmp eq float addrspace(3)* %i2, %end
 
 ; CHECK: br i1 %exit_cond, label %exit, label %loop
-define void @loop() #0 {
+define amdgpu_kernel void @loop() #0 {
 entry:
   %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)*
   %end = getelementptr float, float addrspace(4)* %p, i64 10
@@ -150,7 +150,7 @@ exit:
 ; CHECK: %0 = addrspacecast float addrspace(3)* %i2 to float addrspace(4)*
 ; CHECK: %exit_cond = icmp eq float addrspace(4)* %0, %end
 ; CHECK: br i1 %exit_cond, label %exit, label %loop
-define void @loop_with_generic_bound() #0 {
+define amdgpu_kernel void @loop_with_generic_bound() #0 {
 entry:
   %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)*
   %end = load float addrspace(4)*, float addrspace(4)* addrspace(1)* @generic_end

Modified: llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll (original)
+++ llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll Tue Mar 21 16:39:51 2017
@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: @memset_group_to_flat(
 ; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
+define amdgpu_kernel void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
   %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
   call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
@@ -10,7 +10,7 @@ define void @memset_group_to_flat(i8 add
 
 ; CHECK-LABEL: @memset_global_to_flat(
 ; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
+define amdgpu_kernel void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
   %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
   call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
@@ -18,7 +18,7 @@ define void @memset_global_to_flat(i8 ad
 
 ; CHECK-LABEL: @memset_group_to_flat_no_md(
 ; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 %size, i32 4, i1 false){{$}}
-define void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 {
   %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
   call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false)
   ret void
@@ -26,7 +26,7 @@ define void @memset_group_to_flat_no_md(
 
 ; CHECK-LABEL: @memset_global_to_flat_no_md(
 ; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 %size, i32 4, i1 false){{$}}
-define void @memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 {
   %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
   call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false)
   ret void
@@ -34,7 +34,7 @@ define void @memset_global_to_flat_no_md
 
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group(
 ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
@@ -42,7 +42,7 @@ define void @memcpy_flat_to_flat_replace
 
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group(
 ; CHECK: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 {
   %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
@@ -50,7 +50,7 @@ define void @memcpy_flat_to_flat_replace
 
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_src_with_group(
 ; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %src.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   %cast.dest = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
@@ -59,7 +59,7 @@ define void @memcpy_flat_to_flat_replace
 
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_group_src_global(
 ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(1)* %src.global.ptr to i8 addrspace(4)*
   %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
@@ -68,7 +68,7 @@ define void @memcpy_flat_to_flat_replace
 
 ; CHECK-LABEL: @memcpy_group_to_flat_replace_dest_global(
 ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 {
+define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 {
   %cast.dest = addrspacecast i8 addrspace(1)* %dest.global.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* %cast.dest, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
@@ -76,7 +76,7 @@ define void @memcpy_group_to_flat_replac
 
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(
 ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa.struct !7
-define void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa.struct !7
   ret void
@@ -84,7 +84,7 @@ define void @memcpy_flat_to_flat_replace
 
 ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_no_md(
 ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
-define void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
   ret void
@@ -93,7 +93,7 @@ define void @memcpy_flat_to_flat_replace
 ; CHECK-LABEL: @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(
 ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
 ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
-define void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
@@ -103,14 +103,14 @@ define void @multiple_memcpy_flat_to_fla
 ; Check for iterator problems if the pointer has 2 uses in the same call
 ; CHECK-LABEL: @memcpy_group_flat_to_flat_self(
 ; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 addrspace(3)* %group.ptr, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 {
+define amdgpu_kernel void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 {
   %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
   call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast, i8 addrspace(4)* %cast, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
 }
 ; CHECK-LABEL: @memmove_flat_to_flat_replace_src_with_group(
 ; CHECK: call void @llvm.memmove.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
-define void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
   %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
   call void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
   ret void
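
As the CHECK lines above suggest, the same inference applies to memory intrinsics: a memset or memcpy through a flat cast of a group/global pointer is expected to be rewritten to the matching address-space overload. A minimal sketch (hypothetical function name; uses the 2017-era memset signature with an explicit alignment argument):

  declare void @llvm.memset.p4i8.i64(i8 addrspace(4)* nocapture, i8, i64, i32, i1)

  define amdgpu_kernel void @example_memset_group(i8 addrspace(3)* %group.ptr) {
    %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
    ; Expected to be rewritten to
    ;   call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 32, i32 4, i1 false)
    call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false)
    ret void
  }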

Modified: llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll (original)
+++ llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 
 ; CHECK-LABEL: @generic_address_bitcast_const(
 ; CHECK: %vecload1 = load <2 x double>, <2 x double> addrspace(1)* bitcast (double addrspace(1)* getelementptr inbounds ([100 x double], [100 x double] addrspace(1)* @data, i64 0, i64 4) to <2 x double> addrspace(1)*), align 8
-define void @generic_address_bitcast_const(i64 %arg0, i32 addrspace(1)* nocapture %results) #0 {
+define amdgpu_kernel void @generic_address_bitcast_const(i64 %arg0, i32 addrspace(1)* nocapture %results) #0 {
 entry:
   %tmp1 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = zext i32 %tmp1 to i64
@@ -39,7 +39,7 @@ declare i32 @_Z9get_fencePU3AS4v(i8 addr
 ; CHECK: %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)*
 ; CHECK: %add.ptr = getelementptr inbounds i32, i32 addrspace(3)* %tmp1, i32 2
 ; CHECK: %tmp2 = load i32, i32 addrspace(3)* %add.ptr, align 4
-define void @generic_address_pipe_bug9673(%opencl.pipe_t addrspace(3)* nocapture %in_pipe, i32 addrspace(1)* nocapture %dst) #0 {
+define amdgpu_kernel void @generic_address_pipe_bug9673(%opencl.pipe_t addrspace(3)* nocapture %in_pipe, i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)*
@@ -55,7 +55,7 @@ entry:
 ; CHECK: br i1
 ; CHECK: load float, float addrspace(4)*
 ; CHECK: br label
-define void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 {
+define amdgpu_kernel void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 {
 entry:
   %ptr = alloca float addrspace(4)*, align 8
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -85,7 +85,7 @@ helperFunction.exit:
 ; CHECK-LABEL: @generic_address_opt_phi_bug9776_simple_phi_kernel(
 ; CHECK: phi i32 addrspace(3)*
 ; CHECK: store i32 %i.03, i32 addrspace(3)* %
-define void @generic_address_opt_phi_bug9776_simple_phi_kernel(i32 addrspace(3)* nocapture %in, i32 %numElems) #0 {
+define amdgpu_kernel void @generic_address_opt_phi_bug9776_simple_phi_kernel(i32 addrspace(3)* nocapture %in, i32 %numElems) #0 {
 entry:
   %cmp1 = icmp eq i32 %numElems, 0
   br i1 %cmp1, label %for.end, label %for.body.lr.ph
@@ -110,7 +110,7 @@ for.end:
 ; CHECK-LABEL: @generic_address_bug9899(
 ; CHECK: %vecload = load <2 x i32>, <2 x i32> addrspace(3)*
 ; CHECK: store <2 x i32> %tmp16, <2 x i32> addrspace(3)*
-define void @generic_address_bug9899(i64 %arg0, i32 addrspace(3)* nocapture %sourceA, i32 addrspace(3)* nocapture %destValues) #0 {
+define amdgpu_kernel void @generic_address_bug9899(i64 %arg0, i32 addrspace(3)* nocapture %sourceA, i32 addrspace(3)* nocapture %destValues) #0 {
 entry:
   %tmp1 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = zext i32 %tmp1 to i64

Modified: llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/select.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/select.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/select.ll (original)
+++ llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/select.ll Tue Mar 21 16:39:51 2017
@@ -18,7 +18,7 @@ define i32 addrspace(4)* @return_select_
 ; CHECK-LABEL: @store_select_group_flat(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1
 ; CHECK: store i32 -1, i32 addrspace(3)* %select
-define void @store_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
+define amdgpu_kernel void @store_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1
@@ -43,7 +43,7 @@ define i32 @load_select_group_flat_md(i1
 ; CHECK: %2 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)*
 ; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* %2
 ; CHECK: store i32 -1, i32 addrspace(4)* %select
-define void @store_select_mismatch_group_private_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32* %private.ptr.1) #0 {
+define amdgpu_kernel void @store_select_mismatch_group_private_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32* %private.ptr.1) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %cast1 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1
@@ -73,7 +73,7 @@ bb:
 ; CHECK-LABEL: @store_select_group_flat_null(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
 ; CHECK: store i32 -1, i32 addrspace(3)* %select
-define void @store_select_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null
   store i32 -1, i32 addrspace(4)* %select
@@ -83,7 +83,7 @@ define void @store_select_group_flat_nul
 ; CHECK-LABEL: @store_select_group_flat_null_swap(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*), i32 addrspace(3)* %group.ptr.0
 ; CHECK: store i32 -1, i32 addrspace(3)* %select
-define void @store_select_group_flat_null_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_null_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* null, i32 addrspace(4)* %cast0
   store i32 -1, i32 addrspace(4)* %select
@@ -93,7 +93,7 @@ define void @store_select_group_flat_nul
 ; CHECK-LABEL: @store_select_group_flat_undef(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* undef
 ; CHECK: store i32 -1, i32 addrspace(3)* %select
-define void @store_select_group_flat_undef(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_undef(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* undef
   store i32 -1, i32 addrspace(4)* %select
@@ -103,7 +103,7 @@ define void @store_select_group_flat_und
 ; CHECK-LABEL: @store_select_group_flat_undef_swap(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* undef, i32 addrspace(3)* %group.ptr.0
 ; CHECK: store i32 -1, i32 addrspace(3)* %select
-define void @store_select_group_flat_undef_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_undef_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* undef, i32 addrspace(4)* %cast0
   store i32 -1, i32 addrspace(4)* %select
@@ -114,7 +114,7 @@ define void @store_select_group_flat_und
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
 ; CHECK: %gep = getelementptr i32, i32 addrspace(3)* %select, i64 16
 ; CHECK: store i32 -1, i32 addrspace(3)* %gep
-define void @store_select_gep_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_gep_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null
   %gep = getelementptr i32, i32 addrspace(4)* %select, i64 16
@@ -127,7 +127,7 @@ define void @store_select_gep_group_flat
 ; CHECK-LABEL: @store_select_group_flat_constexpr(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* @lds1
 ; CHECK: store i32 7, i32 addrspace(3)* %select
-define void @store_select_group_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds1 to i32 addrspace(4)*)
   store i32 7, i32 addrspace(4)* %select
@@ -137,7 +137,7 @@ define void @store_select_group_flat_con
 ; CHECK-LABEL: @store_select_group_flat_inttoptr_flat(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*) to i32 addrspace(3)*)
 ; CHECK: store i32 7, i32 addrspace(3)* %select
-define void @store_select_group_flat_inttoptr_flat(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_inttoptr_flat(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*)
   store i32 7, i32 addrspace(4)* %select
@@ -147,7 +147,7 @@ define void @store_select_group_flat_int
 ; CHECK-LABEL: @store_select_group_flat_inttoptr_group(
 ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*)
 ; CHECK-NEXT: store i32 7, i32 addrspace(3)* %select
-define void @store_select_group_flat_inttoptr_group(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_flat_inttoptr_group(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*) to i32 addrspace(4)*)
   store i32 7, i32 addrspace(4)* %select
@@ -158,7 +158,7 @@ define void @store_select_group_flat_int
 ; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
 ; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)
 ; CHECK: store i32 7, i32 addrspace(4)* %select
-define void @store_select_group_global_mismatch_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)
   store i32 7, i32 addrspace(4)* %select
@@ -169,7 +169,7 @@ define void @store_select_group_global_m
 ; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
 ; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %1
 ; CHECK: store i32 7, i32 addrspace(4)* %select
-define void @store_select_group_global_mismatch_flat_constexpr_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %cast0
   store i32 7, i32 addrspace(4)* %select
@@ -179,7 +179,7 @@ define void @store_select_group_global_m
 ; CHECK-LABEL: @store_select_group_global_mismatch_null_null(
 ; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)
 ; CHECK: store i32 7, i32 addrspace(4)* %select
-define void @store_select_group_global_mismatch_null_null(i1 %c) #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_null_null(i1 %c) #0 {
   %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)
   store i32 7, i32 addrspace(4)* %select
   ret void
@@ -187,42 +187,42 @@ define void @store_select_group_global_m
 
 ; CHECK-LABEL: @store_select_group_global_mismatch_null_null_constexpr(
 ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
-define void @store_select_group_global_mismatch_null_null_constexpr() #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_null_null_constexpr() #0 {
   store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
   ret void
 }
 
 ; CHECK-LABEL: @store_select_group_global_mismatch_gv_null_constexpr(
 ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
-define void @store_select_group_global_mismatch_gv_null_constexpr() #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_gv_null_constexpr() #0 {
   store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
   ret void
 }
 
 ; CHECK-LABEL: @store_select_group_global_mismatch_null_gv_constexpr(
 ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4
-define void @store_select_group_global_mismatch_null_gv_constexpr() #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_null_gv_constexpr() #0 {
   store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4
   ret void
 }
 
 ; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_null_constexpr(
 ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
-define void @store_select_group_global_mismatch_inttoptr_null_constexpr() #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_null_constexpr() #0 {
   store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
   ret void
 }
 
 ; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_flat_null_constexpr(
 ; CHECK: store i32 7, i32 addrspace(1)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(1)* addrspacecast (i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*) to i32 addrspace(1)*), i32 addrspace(1)* null), align 4
-define void @store_select_group_global_mismatch_inttoptr_flat_null_constexpr() #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_flat_null_constexpr() #0 {
   store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
   ret void
 }
 
 ; CHECK-LABEL: @store_select_group_global_mismatch_undef_undef_constexpr(
 ; CHECK: store i32 7, i32 addrspace(3)* null
-define void @store_select_group_global_mismatch_undef_undef_constexpr() #0 {
+define amdgpu_kernel void @store_select_group_global_mismatch_undef_undef_constexpr() #0 {
   store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* undef to i32 addrspace(4)*)), align 4
   ret void
 }
@@ -233,7 +233,7 @@ define void @store_select_group_global_m
 ; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
 ; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*)
 ; CHECK: store i32 7, i32 addrspace(4)* %select
-define void @store_select_group_constexpr_ptrtoint(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+define amdgpu_kernel void @store_select_group_constexpr_ptrtoint(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
   %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
   %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*)
   store i32 7, i32 addrspace(4)* %select
@@ -248,7 +248,7 @@ define void @store_select_group_constexp
 ; CHECK: %extract1 = extractelement <2 x i32 addrspace(4)*> %select, i32 1
 ; CHECK: store i32 -1, i32 addrspace(4)* %extract0
 ; CHECK: store i32 -2, i32 addrspace(4)* %extract1
-define void @store_select_group_flat_vector(i1 %c, <2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 {
+define amdgpu_kernel void @store_select_group_flat_vector(i1 %c, <2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 {
   %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 addrspace(4)*>
   %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*>
   %select = select i1 %c, <2 x i32 addrspace(4)*> %cast0, <2 x i32 addrspace(4)*> %cast1
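
The select tests follow the same pattern: when both operands of a select are casts from the same non-flat address space, the select itself is expected to be rewritten into that space. A minimal sketch mirroring the scalar case above (hypothetical function name):

  define amdgpu_kernel void @example_select_group(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) {
    %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
    %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
    ; Expected: the select and the store move to addrspace(3) and both casts die.
    %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1
    store i32 -1, i32 addrspace(4)* %select
    ret void
  }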

Modified: llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll (original)
+++ llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: @volatile_load_flat_from_global(
 ; CHECK: load volatile i32, i32 addrspace(4)*
 ; CHECK: store i32 %val, i32 addrspace(1)*
-define void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
   %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
@@ -16,7 +16,7 @@ define void @volatile_load_flat_from_glo
 ; CHECK-LABEL: @volatile_load_flat_from_constant(
 ; CHECK: load volatile i32, i32 addrspace(4)*
 ; CHECK: store i32 %val, i32 addrspace(1)*
-define void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(2)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
   %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
@@ -27,7 +27,7 @@ define void @volatile_load_flat_from_con
 ; CHECK-LABEL: @volatile_load_flat_from_group(
 ; CHECK: load volatile i32, i32 addrspace(4)*
 ; CHECK: store i32 %val, i32 addrspace(3)*
-define void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)*
   %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
@@ -38,7 +38,7 @@ define void @volatile_load_flat_from_gro
 ; CHECK-LABEL: @volatile_load_flat_from_private(
 ; CHECK: load volatile i32, i32 addrspace(4)*
 ; CHECK: store i32 %val, i32*
-define void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocapture %output) #0 {
   %tmp0 = addrspacecast i32* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32* %output to i32 addrspace(4)*
   %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
@@ -49,7 +49,7 @@ define void @volatile_load_flat_from_pri
 ; CHECK-LABEL: @volatile_store_flat_to_global(
 ; CHECK: load i32, i32 addrspace(1)*
 ; CHECK: store volatile i32 %val, i32 addrspace(4)*
-define void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -60,7 +60,7 @@ define void @volatile_store_flat_to_glob
 ; CHECK-LABEL: @volatile_store_flat_to_group(
 ; CHECK: load i32, i32 addrspace(3)*
 ; CHECK: store volatile i32 %val, i32 addrspace(4)*
-define void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -71,7 +71,7 @@ define void @volatile_store_flat_to_grou
 ; CHECK-LABEL: @volatile_store_flat_to_private(
 ; CHECK: load i32, i32*
 ; CHECK: store volatile i32 %val, i32 addrspace(4)*
-define void @volatile_store_flat_to_private(i32* nocapture %input, i32* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_store_flat_to_private(i32* nocapture %input, i32* nocapture %output) #0 {
   %tmp0 = addrspacecast i32* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -119,7 +119,7 @@ define { i32, i1 } @volatile_cmpxchg_gro
 ; CHECK-LABEL: @volatile_memset_group_to_flat(
 ; CHECK: addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
 ; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true)
-define void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
+define amdgpu_kernel void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
   %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
   call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true)
   ret void
@@ -128,7 +128,7 @@ define void @volatile_memset_group_to_fl
 ; CHECK-LABEL: @volatile_memset_global_to_flat(
 ; CHECK: addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
 ; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true)
-define void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
+define amdgpu_kernel void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
   %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
   call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true)
   ret void
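
The volatile tests exercise the conservative side of the pass: a volatile flat access keeps its addrspace(4) pointer even when the underlying address space is known, while a non-volatile access in the same function may still be rewritten, as the CHECK lines above show. A minimal sketch (hypothetical function name):

  define amdgpu_kernel void @example_volatile_load(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) {
    %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
    %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
    ; Expected: the volatile load keeps its addrspace(4) operand, while the
    ; plain store is still rewritten to use i32 addrspace(1)* %output.
    %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
    store i32 %val, i32 addrspace(4)* %tmp1, align 4
    ret void
  }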

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll Tue Mar 21 16:39:51 2017
@@ -15,7 +15,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; NOSCOPE: load float
 ; NOSCOPE: store float
 ; NOSCOPE: store float
-define void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
 entry:
   %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   store float 0.0, float addrspace(1)* %a, align 4, !noalias !0

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@ target triple = "amdgcn--"
 
 ; ALIGNED: load i8, i8* %ptr0, align 1{{$}}
 ; ALIGNED: load i8, i8* %ptr1, align 1{{$}}
-define void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i8], align 1
   %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset
   %val0 = load i8, i8* %ptr0, align 1
@@ -27,7 +27,7 @@ define void @load_unknown_offset_align1_
 
 ; ALIGNED: load i16, i16* %ptr0, align 1{{$}}
 ; ALIGNED: load i16, i16* %ptr1, align 1{{$}}
-define void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i16], align 1
   %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset
   %val0 = load i16, i16* %ptr0, align 1
@@ -47,7 +47,7 @@ define void @load_unknown_offset_align1_
 
 ; ALIGNED: load i32, i32* %ptr0, align 1
 ; ALIGNED: load i32, i32* %ptr1, align 1
-define void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 1
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
   %val0 = load i32, i32* %ptr0, align 1
@@ -68,7 +68,7 @@ define void @load_unknown_offset_align1_
 ; FIXME: Should change alignment
 ; ALIGNED: load i32
 ; ALIGNED: load i32
-define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 16
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
   %val0 = load i32, i32* %ptr0, align 1
@@ -85,7 +85,7 @@ define void @load_alloca16_unknown_offse
 
 ; ALIGNED: store i8 9, i8* %ptr0, align 1{{$}}
 ; ALIGNED: store i8 10, i8* %ptr1, align 1{{$}}
-define void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i8], align 1
   %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset
   store i8 9, i8* %ptr0, align 1
@@ -100,7 +100,7 @@ define void @store_unknown_offset_align1
 
 ; ALIGNED: store i16 9, i16* %ptr0, align 1{{$}}
 ; ALIGNED: store i16 10, i16* %ptr1, align 1{{$}}
-define void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i16], align 1
   %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset
   store i16 9, i16* %ptr0, align 1
@@ -119,7 +119,7 @@ define void @store_unknown_offset_align1
 
 ; ALIGNED: store i32 9, i32* %ptr0, align 1
 ; ALIGNED: store i32 10, i32* %ptr1, align 1
-define void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 1
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
   store i32 9, i32* %ptr0, align 1

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK: sext i32 %id.x to i64
 ; CHECK: load <2 x float>
 ; CHECK: store <2 x float> zeroinitializer
-define void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %sext.id.x = sext i32 %id.x to i64
@@ -32,7 +32,7 @@ entry:
 ; CHECK: zext i32 %id.x to i64
 ; CHECK: load <2 x float>
 ; CHECK: store <2 x float>
-define void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %zext.id.x = zext i32 %id.x to i64
@@ -54,7 +54,7 @@ entry:
 ; CHECK-LABEL: @merge_op_zext_index(
 ; CHECK: load <2 x float>
 ; CHECK: store <2 x float>
-define void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
+define amdgpu_kernel void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %shl = shl i32 %id.x, 2
@@ -81,7 +81,7 @@ entry:
 ; CHECK-LABEL: @merge_op_sext_index(
 ; CHECK: load <2 x float>
 ; CHECK: store <2 x float>
-define void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
+define amdgpu_kernel void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %shl = shl i32 %id.x, 2
@@ -112,7 +112,7 @@ entry:
 ; CHECK: loop:
 ; CHECK: load <2 x i32>
 ; CHECK: store <2 x i32>
-define void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
+define amdgpu_kernel void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
 entry:
   %cmp0 = icmp eq i32 %n, 0
   br i1 %cmp0, label %exit, label %loop
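
For the LoadStoreVectorizer tests, the expectation shown by the CHECK lines is that adjacent scalar accesses are merged into a single vector access once the kernel calling convention is in place. A minimal sketch of the store side (hypothetical function name; assumes the legacy opt -load-store-vectorizer pass name and sufficient alignment on the first store):

  define amdgpu_kernel void @example_merge_adjacent_stores(float addrspace(1)* nocapture %a) {
    %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
    ; Expected: the two scalar stores merge into a single
    ;   store <2 x float> zeroinitializer, <2 x float> addrspace(1)* ...
    store float 0.0, float addrspace(1)* %a, align 8
    store float 0.0, float addrspace(1)* %a.idx.1, align 4
    ret void
  }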

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; CHECK: load <2 x float>
 ; CHECK: %w = add i32 %y, 9
 ; CHECK: %foo = add i32 %z, %w
-define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
 entry:
   %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
   %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
@@ -38,7 +38,7 @@ entry:
 ; CHECK: %w = add i32 %y, 9
 ; CHECK: store <2 x float>
 ; CHECK: %foo = add i32 %z, %w
-define void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
 entry:
   %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
   %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; CHECK: store double 0.000000e+00, double addrspace(1)* %a,
 ; CHECK: load double
 ; CHECK: store double 0.000000e+00, double addrspace(1)* %a.idx.1
-define void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 {
 entry:
   %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
   %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll Tue Mar 21 16:39:51 2017
@@ -17,7 +17,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; ELT8-UNALIGNED: store <2 x i32>
 
 ; ELT16-UNALIGNED: store <4 x i32>
-define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2
   %out.gep.3 = getelementptr i32, i32* %out, i32 3
@@ -44,7 +44,7 @@ define void @merge_private_store_4_vecto
 ; ELT4-UNALIGNED: store i32
 ; ELT4-UNALIGNED: store i32
 ; ELT4-UNALIGNED: store i32
-define void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2
   %out.gep.3 = getelementptr i32, i32* %out, i32 3
@@ -71,7 +71,7 @@ define void @merge_private_store_4_vecto
 ; ELT4-UNALIGNED: store i32
 ; ELT4-UNALIGNED: store i32
 ; ELT4-UNALIGNED: store i32
-define void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2
   %out.gep.3 = getelementptr i32, i32* %out, i32 3
@@ -85,7 +85,7 @@ define void @merge_private_store_4_vecto
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
 ; ALL: store <4 x i8>
-define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
   %out.gep.1 = getelementptr i8, i8* %out, i32 1
   %out.gep.2 = getelementptr i8, i8* %out, i32 2
   %out.gep.3 = getelementptr i8, i8* %out, i32 3
@@ -104,7 +104,7 @@ define void @merge_private_store_4_vecto
 ; ALIGNED: store i8
 
 ; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8>* %1, align 1
-define void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 {
   %out.gep.1 = getelementptr i8, i8* %out, i32 1
   %out.gep.2 = getelementptr i8, i8* %out, i32 2
   %out.gep.3 = getelementptr i8, i8* %out, i32 3
@@ -118,7 +118,7 @@ define void @merge_private_store_4_vecto
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16(
 ; ALL: store <2 x i16>
-define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
   %out.gep.1 = getelementptr i16, i16* %out, i32 1
 
   store i16 9, i16* %out, align 4
@@ -131,7 +131,7 @@ define void @merge_private_store_4_vecto
 ; ALIGNED: store i16
 
 ; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 2
-define void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 {
   %out.gep.1 = getelementptr i16, i16* %out, i32 1
 
   store i16 9, i16* %out, align 2
@@ -144,7 +144,7 @@ define void @merge_private_store_4_vecto
 ; ALIGNED: store i16
 
 ; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 1
-define void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 {
   %out.gep.1 = getelementptr i16, i16* %out, i32 1
 
   store i16 9, i16* %out, align 1
@@ -154,7 +154,7 @@ define void @merge_private_store_4_vecto
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8(
 ; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 8
-define void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 {
   %out.gep.1 = getelementptr i16, i16* %out, i32 1
 
   store i16 9, i16* %out, align 8
@@ -179,7 +179,7 @@ define void @merge_private_store_4_vecto
 ; ELT16-ALIGNED: store i32
 
 ; ELT16-UNALIGNED: store <3 x i32>
-define void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 {
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2
 
@@ -202,7 +202,7 @@ define void @merge_private_store_3_vecto
 ; ELT8-UNALIGNED: store i32
 
 ; ELT16-UNALIGNED: store <3 x i32>
-define void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 {
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2
 
@@ -218,7 +218,7 @@ define void @merge_private_store_3_vecto
 ; ALIGNED: store i8
 
 ; UNALIGNED: store <3 x i8>
-define void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 {
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 {
   %out.gep.1 = getelementptr i8, i8* %out, i8 1
   %out.gep.2 = getelementptr i8, i8* %out, i8 2
 

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i8(
 ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2
-define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
 
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -20,7 +20,7 @@ define void @merge_global_store_2_consta
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align
 ; CHECK: store <2 x i8>
-define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
 
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -30,7 +30,7 @@ define void @merge_global_store_2_consta
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i16
 ; CHECK: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 123, i16 addrspace(1)* %out.gep.1
@@ -40,7 +40,7 @@ define void @merge_global_store_2_consta
 
 ; CHECK-LABEL: @merge_global_store_2_constants_0_i16
 ; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 0, i16 addrspace(1)* %out.gep.1
@@ -50,7 +50,7 @@ define void @merge_global_store_2_consta
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align
 ; CHECK: store <2 x i16>
-define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 123, i16 addrspace(1)* %out.gep.1
@@ -60,7 +60,7 @@ define void @merge_global_store_2_consta
 
 ; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align
 ; CHECK: store <2 x half>
-define void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
 
   store half 2.0, half addrspace(1)* %out.gep.1
@@ -70,7 +70,7 @@ define void @merge_global_store_2_consta
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i32
 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 
   store i32 123, i32 addrspace(1)* %out.gep.1
@@ -80,7 +80,7 @@ define void @merge_global_store_2_consta
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i32_f32
 ; CHECK: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
   store float 1.0, float addrspace(1)* %out.gep.1.bc
@@ -90,7 +90,7 @@ define void @merge_global_store_2_consta
 
 ; CHECK-LABEL: @merge_global_store_2_constants_f32_i32
 ; CHECK  store <2 x float> <float 4.000000e+00, float 0x370EC00000000000>, <2 x float> addrspace(1)* %{{[0-9]+$}}
-define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
   store i32 123, i32 addrspace(1)* %out.gep.1.bc
@@ -100,7 +100,7 @@ define void @merge_global_store_2_consta
 
 ; CHECK-LABEL: @merge_global_store_4_constants_i32
 ; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -114,7 +114,7 @@ define void @merge_global_store_4_consta
 
 ; CHECK-LABEL: @merge_global_store_4_constants_f32_order
 ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}
-define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -129,7 +129,7 @@ define void @merge_global_store_4_consta
 ; First store is out of order.
 ; CHECK-LABEL: @merge_global_store_4_constants_f32
 ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -143,7 +143,7 @@ define void @merge_global_store_4_consta
 
 ; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32
 ; CHECK: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -160,7 +160,7 @@ define void @merge_global_store_4_consta
 
 ; CHECK-LABEL: @merge_global_store_3_constants_i32
 ; CHECK: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 
@@ -172,7 +172,7 @@ define void @merge_global_store_3_consta
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i64
 ; CHECK: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
-define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 
   store i64 123, i64 addrspace(1)* %out.gep.1
@@ -183,7 +183,7 @@ define void @merge_global_store_2_consta
 ; CHECK-LABEL: @merge_global_store_4_constants_i64
 ; CHECK: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
 ; CHECK: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
-define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
@@ -202,7 +202,7 @@ define void @merge_global_store_4_consta
 ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1
 ; CHECK: store <2 x i32> [[INSERT1]]
-define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 
@@ -220,7 +220,7 @@ define void @merge_global_store_2_adjace
 ; CHECK: insertelement
 ; CHECK: insertelement
 ; CHECK: store <2 x i32>
-define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 
@@ -241,7 +241,7 @@ define void @merge_global_store_2_adjace
 ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT1]], i32 0
 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1
 ; CHECK: store <2 x i32> [[INSERT1]]
-define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 
@@ -256,7 +256,7 @@ define void @merge_global_store_2_adjace
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -279,7 +279,7 @@ define void @merge_global_store_4_adjace
 ; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32
 ; CHECK: load <3 x i32>
 ; CHECK: store <3 x i32>
-define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
@@ -298,7 +298,7 @@ define void @merge_global_store_3_adjace
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32
 ; CHECK: load <4 x float>
 ; CHECK: store <4 x float>
-define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -321,7 +321,7 @@ define void @merge_global_store_4_adjace
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
@@ -346,7 +346,7 @@ define void @merge_global_store_4_adjace
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -373,7 +373,7 @@ define void @merge_global_store_4_adjace
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -408,7 +408,7 @@ define void @merge_global_store_4_adjace
 ; CHECK: insertelement <4 x i8>
 ; CHECK: insertelement <4 x i8>
 ; CHECK: store <4 x i8>
-define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
@@ -431,7 +431,7 @@ define void @merge_global_store_4_adjace
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align
 ; CHECK: load <4 x i8>
 ; CHECK: store <4 x i8>
-define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
@@ -454,7 +454,7 @@ define void @merge_global_store_4_adjace
 ; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -474,7 +474,7 @@ define void @merge_global_store_4_vector
 
 ; CHECK-LABEL: @merge_local_store_2_constants_i8
 ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2
-define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
 
   store i8 123, i8 addrspace(3)* %out.gep.1
@@ -484,7 +484,7 @@ define void @merge_local_store_2_constan
 
 ; CHECK-LABEL: @merge_local_store_2_constants_i32
 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4
-define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 
   store i32 123, i32 addrspace(3)* %out.gep.1
@@ -495,7 +495,7 @@ define void @merge_local_store_2_constan
 ; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2
 ; CHECK: store i32
 ; CHECK: store i32
-define void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 
   store i32 123, i32 addrspace(3)* %out.gep.1, align 2
@@ -506,7 +506,7 @@ define void @merge_local_store_2_constan
 ; CHECK-LABEL: @merge_local_store_4_constants_i32
 ; CHECK: store <2 x i32> <i32 456, i32 333>, <2 x i32> addrspace(3)*
 ; CHECK: store <2 x i32> <i32 1234, i32 123>, <2 x i32> addrspace(3)*
-define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
@@ -521,7 +521,7 @@ define void @merge_local_store_4_constan
 ; CHECK-LABEL: @merge_global_store_5_constants_i32
 ; CHECK: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 ; CHECK: store i32
-define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
   store i32 9, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 12, i32 addrspace(1)* %idx1, align 4
@@ -537,7 +537,7 @@ define void @merge_global_store_5_consta
 ; CHECK-LABEL: @merge_global_store_6_constants_i32
 ; CHECK: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 ; CHECK: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
   store i32 13, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 15, i32 addrspace(1)* %idx1, align 4
@@ -555,7 +555,7 @@ define void @merge_global_store_6_consta
 ; CHECK-LABEL: @merge_global_store_7_constants_i32
 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 ; CHECK: store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 999, i32 addrspace(1)* %idx1, align 4
@@ -575,7 +575,7 @@ define void @merge_global_store_7_consta
 ; CHECK-LABEL: @merge_global_store_8_constants_i32
 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 ; CHECK: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 999, i32 addrspace(1)* %idx1, align 4
@@ -597,7 +597,7 @@ define void @merge_global_store_8_consta
 ; CHECK-LABEL: @copy_v3i32_align4
 ; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
 ; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
-define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
   store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
   ret void
@@ -606,7 +606,7 @@ define void @copy_v3i32_align4(<3 x i32>
 ; CHECK-LABEL: @copy_v3i64_align4
 ; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
 ; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
-define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
   store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
   ret void
@@ -615,7 +615,7 @@ define void @copy_v3i64_align4(<3 x i64>
 ; CHECK-LABEL: @copy_v3f32_align4
 ; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
 ; CHECK: store <3 x float>
-define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
   %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
   store <3 x float> %fadd, <3 x float> addrspace(1)* %out
@@ -625,7 +625,7 @@ define void @copy_v3f32_align4(<3 x floa
 ; CHECK-LABEL: @copy_v3f64_align4
 ; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
 ; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out
-define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
   %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
   store <3 x double> %fadd, <3 x double> addrspace(1)* %out

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; CHECK-LABEL: @merge_v2i32_v2i32(
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32> zeroinitializer
-define void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 1
@@ -22,7 +22,7 @@ entry:
 ; CHECK-LABEL: @merge_v1i32_v1i32(
 ; CHECK: load <2 x i32>
 ; CHECK: store <2 x i32> zeroinitializer
-define void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %b, i64 1
@@ -41,7 +41,7 @@ entry:
 ; CHECK: load <3 x i32>
 ; CHECK: store <3 x i32> zeroinitializer
 ; CHECK: store <3 x i32> zeroinitializer
-define void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b, i64 1
@@ -58,7 +58,7 @@ entry:
 ; CHECK-LABEL: @merge_v2i16_v2i16(
 ; CHECK: load <4 x i16>
 ; CHECK: store <4 x i16> zeroinitializer
-define void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i64 1
@@ -76,7 +76,7 @@ entry:
 ; CHECK-LABEL: @merge_load_i32_v2i16(
 ; CHECK: load i32,
 ; CHECK: load <2 x i16>
-define void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 1
   %a.1.cast = bitcast i32 addrspace(1)* %a.1 to <2 x i16> addrspace(1)*

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 
 ; CHECK-LABEL: @load_keep_base_alignment_missing_align(
 ; CHECK: load <2 x float>, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4
-define void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) {
+define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) {
   %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11
   %val0 = load float, float addrspace(3)* %ptr0
 
@@ -21,7 +21,7 @@ define void @load_keep_base_alignment_mi
 
 ; CHECK-LABEL: @store_keep_base_alignment_missing_align(
 ; CHECK: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4
-define void @store_keep_base_alignment_missing_align() {
+define amdgpu_kernel void @store_keep_base_alignment_missing_align() {
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1
   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2
   store float 0.0, float addrspace(3)* %arrayidx0

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; CHECK: store i32 0
 ; CHECK: store i32 0
 
-define void @no_crash(i32 %arg) {
+define amdgpu_kernel void @no_crash(i32 %arg) {
   %tmp2 = add i32 %arg, 14
   %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2
   %tmp4 = add i32 %arg, 15
@@ -37,7 +37,7 @@ define void @no_crash(i32 %arg) {
 ; CHECK: load i32
 ; CHECK: load i32
 
-define void @interleave_get_longest(i32 %arg) {
+define amdgpu_kernel void @interleave_get_longest(i32 %arg) {
   %a1 = add i32 %arg, 1
   %a2 = add i32 %arg, 2
   %a3 = add i32 %arg, 3

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; CHECK: store i32
 ; CHECK: store i32
 ; CHECK: store i32
-define void @no_implicit_float(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @no_implicit_float(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: @optnone(
 ; CHECK: store i32
 ; CHECK: store i32
-define void @optnone(i32 addrspace(1)* %out) noinline optnone {
+define amdgpu_kernel void @optnone(i32 addrspace(1)* %out) noinline optnone {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 
   store i32 123, i32 addrspace(1)* %out.gep.1
@@ -13,7 +13,7 @@ define void @optnone(i32 addrspace(1)* %
 
 ; CHECK-LABEL: @do_opt(
 ; CHECK: store <2 x i32>
-define void @do_opt(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @do_opt(i32 addrspace(1)* %out) {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 
   store i32 123, i32 addrspace(1)* %out.gep.1

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
 ; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
 ; CHECK: store <2 x i64> zeroinitializer
-define void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, i64 1
@@ -28,7 +28,7 @@ entry:
 ; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)*
 ; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)*
 ; CHECK: store <2 x i32> zeroinitializer
-define void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i64 1
   %b.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, i64 1
@@ -46,7 +46,7 @@ entry:
 ; CHECK: load <2 x i64>
 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
 ; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)*
-define void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
   %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)*
@@ -61,7 +61,7 @@ entry:
 ; CHECK: load <2 x i64>
 ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
 ; CHECK: inttoptr i64 [[ELT0]] to i8 addrspace(1)*
-define void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
   %a.1 =  getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
@@ -76,7 +76,7 @@ entry:
 ; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64
 ; CHECK: insertelement <2 x i64> undef, i64 [[ELT0]], i32 0
 ; CHECK: store <2 x i64>
-define void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 {
+define amdgpu_kernel void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 {
 entry:
   %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
   %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
@@ -92,7 +92,7 @@ entry:
 ; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64
 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1]], i32 1
 ; CHECK: store <2 x i64>
-define void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 {
+define amdgpu_kernel void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
   %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to i64 addrspace(1)*
@@ -107,7 +107,7 @@ entry:
 ; CHECK: load <2 x i32>
 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1
 ; CHECK: inttoptr i32 [[ELT1]] to i8 addrspace(3)*
-define void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
   %a.1.cast = bitcast i32 addrspace(3)* %a.1 to i8 addrspace(3)* addrspace(3)*
@@ -122,7 +122,7 @@ entry:
 ; CHECK: load <2 x i32>
 ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0
 ; CHECK: inttoptr i32 [[ELT0]] to i8 addrspace(3)*
-define void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 {
 entry:
   %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)*
   %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
@@ -137,7 +137,7 @@ entry:
 ; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32
 ; CHECK: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
 ; CHECK: store <2 x i32>
-define void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 {
+define amdgpu_kernel void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 {
 entry:
   %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)*
   %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
@@ -152,7 +152,7 @@ entry:
 ; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32
 ; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1
 ; CHECK: store <2 x i32>
-define void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 {
+define amdgpu_kernel void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i32 1
   %a.cast = bitcast i8 addrspace(3)* addrspace(3)* %a to i32 addrspace(3)*
@@ -166,7 +166,7 @@ entry:
 ; CHECK-LABEL: @no_merge_store_ptr32_i64(
 ; CHECK: store i8 addrspace(3)*
 ; CHECK: store i64
-define void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 {
+define amdgpu_kernel void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 {
 entry:
   %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)*
   %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
@@ -181,7 +181,7 @@ entry:
 ; CHECK-LABEL: @no_merge_store_i64_ptr32(
 ; CHECK: store i64
 ; CHECK: store i8 addrspace(3)*
-define void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 {
+define amdgpu_kernel void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 {
 entry:
   %a.1 =  getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a, i64 1
   %a.cast = bitcast i8 addrspace(3)* addrspace(1)* %a to i64 addrspace(1)*
@@ -195,7 +195,7 @@ entry:
 ; CHECK-LABEL: @no_merge_load_i64_ptr32(
 ; CHECK: load i64,
 ; CHECK: load i8 addrspace(3)*,
-define void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
   %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(3)* addrspace(1)*
@@ -209,7 +209,7 @@ entry:
 ; CHECK-LABEL: @no_merge_load_ptr32_i64(
 ; CHECK: load i8 addrspace(3)*,
 ; CHECK: load i64,
-define void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)*
   %a.1 =  getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
@@ -226,7 +226,7 @@ entry:
 ; CHECK: load <2 x i8 addrspace(1)*>
 ; CHECK: store <2 x i8 addrspace(1)*>
 ; CHECK: store <2 x i8 addrspace(1)*>
-define void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 {
+define amdgpu_kernel void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, i64 1
@@ -245,7 +245,7 @@ entry:
 ; CHECK: [[ELT0_INT:%[^ ]+]] = inttoptr i64 [[ELT0]] to i8 addrspace(1)*
 ; CHECK: [[ELT1_INT:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
 ; CHECK: bitcast i64 [[ELT1_INT]] to double
-define void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 {
 entry:
   %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
   %a.1 =  getelementptr inbounds double, double addrspace(1)* %a, i64 1
@@ -262,7 +262,7 @@ entry:
 ; CHECK: bitcast i64 [[ELT0]] to double
 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
 ; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)*
-define void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
   %a.1.cast = bitcast double addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)*
@@ -279,7 +279,7 @@ entry:
 ; CHECK: [[ELT1_INT:%[^ ]+]] = bitcast double %val1 to i64
 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
 ; CHECK: store <2 x i64>
-define void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 {
+define amdgpu_kernel void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 {
 entry:
   %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
   %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
@@ -296,7 +296,7 @@ entry:
 ; CHECK: [[ELT1_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64
 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
 ; CHECK: store <2 x i64>
-define void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 {
+define amdgpu_kernel void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
   %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to double addrspace(1)*

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; CHECK: store <4 x float>
 
 ; Function Attrs: nounwind
-define void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 {
+define amdgpu_kernel void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 {
 bb:
   %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)*
   %tmp1 = load float, float addrspace(1)* %tmp, align 4

Modified: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll (original)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll Tue Mar 21 16:39:51 2017
@@ -16,7 +16,7 @@ declare void @use_v2i9(<2 x i9>)
 ; CHECK-LABEL: @merge_store_2_constants_i1(
 ; CHECK: store i1
 ; CHECK: store i1
-define void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1
   store i1 true, i1 addrspace(1)* %out.gep.1
   store i1 false, i1 addrspace(1)* %out
@@ -26,7 +26,7 @@ define void @merge_store_2_constants_i1(
 ; CHECK-LABEL: @merge_store_2_constants_i2(
 ; CHECK: store i2 1
 ; CHECK: store i2 -1
-define void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1
   store i2 1, i2 addrspace(1)* %out.gep.1
   store i2 -1, i2 addrspace(1)* %out
@@ -36,7 +36,7 @@ define void @merge_store_2_constants_i2(
 ; CHECK-LABEL: @merge_different_store_sizes_i1_i8(
 ; CHECK: store i1 true
 ; CHECK: store i8 123
-define void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
   %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)*
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
   store i1 true, i1 addrspace(1)* %out.i1
@@ -47,7 +47,7 @@ define void @merge_different_store_sizes
 ; CHECK-LABEL: @merge_different_store_sizes_i8_i1(
 ; CHECK: store i8 123
 ; CHECK: store i1 true
-define void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
   %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)*
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -58,7 +58,7 @@ define void @merge_different_store_sizes
 ; CHECK-LABEL: @merge_store_2_constant_structs(
 ; CHECK: store %struct.foo
 ; CHECK: store %struct.foo
-define void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1
   store %struct.foo { i32 12, i8 3 }, %struct.foo addrspace(1)* %out.gep.1
   store %struct.foo { i32 92, i8 9 }, %struct.foo addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @merge_store_2_constant_stru
 ; CHECK-LABEL: @merge_store_2_constants_v2i2(
 ; CHECK: store <2 x i2>
 ; CHECK: store <2 x i2>
-define void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1
   store <2 x i2> <i2 1, i2 -1>, <2 x i2> addrspace(1)* %out.gep.1
   store <2 x i2> <i2 -1, i2 1>, <2 x i2> addrspace(1)* %out
@@ -81,7 +81,7 @@ define void @merge_store_2_constants_v2i
 ; CHECK-LABEL: @merge_store_2_constants_v4i2(
 ; CHECK: store <4 x i2>
 ; CHECK: store <4 x i2>
-define void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1
   store <4 x i2> <i2 1, i2 -1, i2 1, i2 -1>, <4 x i2> addrspace(1)* %out.gep.1
   store <4 x i2> <i2 -1, i2 1, i2 -1, i2 1>, <4 x i2> addrspace(1)* %out
@@ -91,7 +91,7 @@ define void @merge_store_2_constants_v4i
 ; CHECK-LABEL: @merge_load_2_constants_i1(
 ; CHECK: load i1
 ; CHECK: load i1
-define void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1
   %x = load i1, i1 addrspace(1)* %out.gep.1
   %y = load i1, i1 addrspace(1)* %out
@@ -103,7 +103,7 @@ define void @merge_load_2_constants_i1(i
 ; CHECK-LABEL: @merge_load_2_constants_i2(
 ; CHECK: load i2
 ; CHECK: load i2
-define void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1
   %x = load i2, i2 addrspace(1)* %out.gep.1
   %y = load i2, i2 addrspace(1)* %out
@@ -115,7 +115,7 @@ define void @merge_load_2_constants_i2(i
 ; CHECK-LABEL: @merge_different_load_sizes_i1_i8(
 ; CHECK: load i1
 ; CHECK: load i8
-define void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
   %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)*
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
   %x = load i1, i1 addrspace(1)* %out.i1
@@ -128,7 +128,7 @@ define void @merge_different_load_sizes_
 ; CHECK-LABEL: @merge_different_load_sizes_i8_i1(
 ; CHECK: load i8
 ; CHECK: load i1
-define void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
   %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)*
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1
   %x = load i8, i8 addrspace(1)* %out.gep.1
@@ -141,7 +141,7 @@ define void @merge_different_load_sizes_
 ; CHECK-LABEL: @merge_load_2_constant_structs(
 ; CHECK: load %struct.foo
 ; CHECK: load %struct.foo
-define void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1
   %x = load %struct.foo, %struct.foo addrspace(1)* %out.gep.1
   %y = load %struct.foo, %struct.foo addrspace(1)* %out
@@ -153,7 +153,7 @@ define void @merge_load_2_constant_struc
 ; CHECK-LABEL: @merge_load_2_constants_v2i2(
 ; CHECK: load <2 x i2>
 ; CHECK: load <2 x i2>
-define void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1
   %x = load <2 x i2>, <2 x i2> addrspace(1)* %out.gep.1
   %y = load <2 x i2>, <2 x i2> addrspace(1)* %out
@@ -165,7 +165,7 @@ define void @merge_load_2_constants_v2i2
 ; CHECK-LABEL: @merge_load_2_constants_v4i2(
 ; CHECK: load <4 x i2>
 ; CHECK: load <4 x i2>
-define void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1
   %x = load <4 x i2>, <4 x i2> addrspace(1)* %out.gep.1
   %y = load <4 x i2>, <4 x i2> addrspace(1)* %out
@@ -177,7 +177,7 @@ define void @merge_load_2_constants_v4i2
 ; CHECK-LABEL: @merge_store_2_constants_i9(
 ; CHECK: store i9 3
 ; CHECK: store i9 -5
-define void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i9, i9 addrspace(1)* %out, i32 1
   store i9 3, i9 addrspace(1)* %out.gep.1
   store i9 -5, i9 addrspace(1)* %out
@@ -187,7 +187,7 @@ define void @merge_store_2_constants_i9(
 ; CHECK-LABEL: @merge_load_2_constants_v2i9(
 ; CHECK: load <2 x i9>
 ; CHECK: load <2 x i9>
-define void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <2 x i9>, <2 x i9> addrspace(1)* %out, i32 1
   %x = load <2 x i9>, <2 x i9> addrspace(1)* %out.gep.1
   %y = load <2 x i9>, <2 x i9> addrspace(1)* %out

Modified: llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll (original)
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll Tue Mar 21 16:39:51 2017
@@ -17,7 +17,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; OPT: %tmp7 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 undef seq_cst
 ; OPT: %0 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 %tmp8 seq_cst
 ; OPT: br i1 %exitcond
-define void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -54,7 +54,7 @@ bb:
 ; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
 ; OPT: %tmp4 = cmpxchg i32 addrspace(3)* %scevgep4, i32 undef, i32 undef seq_cst monotonic
-define void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

Modified: llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll (original)
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; OPT: %lsr.iv2 = phi i8 addrspace(1)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv2, i64 4095
 ; OPT: load i8, i8 addrspace(1)* %scevgep4, align 1
-define void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -48,7 +48,7 @@ bb:
 ; OPT: {{^}}.lr.ph:
 ; OPT: %lsr.iv3 = phi i8 addrspace(1)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv3, i64 1
-define void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -83,7 +83,7 @@ bb:
 ; OPT: %lsr.iv2 = phi i8 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv2, i32 65535
 ; OPT: %tmp4 = load i8, i8 addrspace(3)* %scevgep4, align 1
-define void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -122,7 +122,7 @@ bb:
 ; OPT: {{^}}.lr.ph:
 ; OPT: %lsr.iv3 = phi i8 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv3, i32 1
-define void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

Modified: llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll (original)
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll Tue Mar 21 16:39:51 2017
@@ -15,7 +15,7 @@ target triple = "amdgcn--"
 ;CHECK: buffer_store_dword
 ;CHECK: s_branch [[LOOP_LABEL]]
 
-define void @foo() {
+define amdgpu_kernel void @foo() {
 entry:
   br label %loop
 

Modified: llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll (original)
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll Tue Mar 21 16:39:51 2017
@@ -16,7 +16,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; CHECK: bb:
 ; CHECK: inttoptr i32 %lsr.iv.next2 to i8 addrspace(3)*
 ; CHECK: %c1 = icmp ne i8 addrspace(3)*
-define void @local_cmp_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @local_cmp_user(i32 %arg0) nounwind {
 entry:
   br label %bb11
 
@@ -47,7 +47,7 @@ bb13:
 ; CHECK: bb:
 ; CHECK: inttoptr i64 %lsr.iv.next2 to i8 addrspace(1)*
 ; CHECK: icmp ne i8 addrspace(1)* %t
-define void @global_cmp_user(i64 %arg0) nounwind {
+define amdgpu_kernel void @global_cmp_user(i64 %arg0) nounwind {
 entry:
   br label %bb11
 
@@ -78,7 +78,7 @@ bb13:
 ; CHECK: bb:
 ; CHECK: %idxprom = sext i32 %lsr.iv1 to i64
 ; CHECK: getelementptr i8, i8 addrspace(1)* %t, i64 %idxprom
-define void @global_gep_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @global_gep_user(i32 %arg0) nounwind {
 entry:
   br label %bb11
 
@@ -108,7 +108,7 @@ bb13:
 
 ; CHECK: bb
 ; CHECK: %p = getelementptr i8, i8 addrspace(1)* %t, i64 %ii.ext
-define void @global_sext_scale_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @global_sext_scale_user(i32 %arg0) nounwind {
 entry:
   br label %bb11
 

Modified: llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll (original)
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll Tue Mar 21 16:39:51 2017
@@ -14,7 +14,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 
 ; CHECK: %scevgep = getelementptr i32, i32 addrspace(3)* %tmp1, i32 4
 ; CHECK:%tmp14 = load i32, i32 addrspace(3)* %scevgep
-define void @lsr_crash_preserve_addrspace_unknown_type() #0 {
+define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 {
 bb:
   br label %bb1
 

Modified: llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll (original)
+++ llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 ; CHECK: call void @llvm.amdgcn.s.barrier()
 ; CHECK: call void @llvm.amdgcn.s.barrier()
 ; CHECK-NOT: br
-define void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 {
+define amdgpu_kernel void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 {
 entry:
   br label %for.body
 

Modified: llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll (original)
+++ llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 ; CHECK:       store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4
 ; CHECK:       ret void
 
-define void @non_invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
+define amdgpu_kernel void @non_invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
 entry:
   %arr = alloca [64 x i32], align 4
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -40,7 +40,7 @@ for.body:
 ; CHECK:       br i1 %[[exitcond]]
 ; CHECK-NOT:   icmp eq i32 %{{.*}}, 100
 
-define void @invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
+define amdgpu_kernel void @invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
 entry:
   %arr = alloca [64 x i32], align 4
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -82,7 +82,7 @@ for.body6:
 ; CHECK:       icmp eq i32 %{{.*}}, 100
 ; CHECK:       br
 
-define void @too_big(i32 addrspace(1)* nocapture %a, i32 %x) {
+define amdgpu_kernel void @too_big(i32 addrspace(1)* nocapture %a, i32 %x) {
 entry:
   %arr = alloca [256 x i32], align 4
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -116,7 +116,7 @@ for.body:
 ; CHECK:       icmp eq i32 %{{.*}}, 100
 ; CHECK:       br
 
-define void @dynamic_size_alloca(i32 addrspace(1)* nocapture %a, i32 %n, i32 %x) {
+define amdgpu_kernel void @dynamic_size_alloca(i32 addrspace(1)* nocapture %a, i32 %n, i32 %x) {
 entry:
   %arr = alloca i32, i32 %n, align 4
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1

Modified: llvm/trunk/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll (original)
+++ llvm/trunk/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll Tue Mar 21 16:39:51 2017
@@ -3,14 +3,14 @@
 ; Check that loop unswitch happened and condition hoisted out of the loop.
 ; Condition is uniform so all targets should perform unswitching.
 
-; CHECK-LABEL: {{^}}define void @uniform_unswitch
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch
 ; CHECK: entry:
 ; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp
 ; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456
 ; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]]
 ; CHECK-NEXT: br i1
 
-define void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
+define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
 entry:
   %cmp6 = icmp sgt i32 %n, 0
   br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
@@ -42,14 +42,14 @@ for.inc:
 
 ; Check that loop unswitch does not happen if condition is divergent.
 
-; CHECK-LABEL: {{^}}define void @divergent_unswitch
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @divergent_unswitch
 ; CHECK: entry:
 ; CHECK: icmp
 ; CHECK: [[IF_COND:%[a-z0-9]+]] = icmp {{.*}} 567890
 ; CHECK: br label
 ; CHECK: br i1 [[IF_COND]]
 
-define void @divergent_unswitch(i32 * nocapture %out, i32 %n) {
+define amdgpu_kernel void @divergent_unswitch(i32 * nocapture %out, i32 %n) {
 entry:
   %cmp9 = icmp sgt i32 %n, 0
   br i1 %cmp9, label %for.body.lr.ph, label %for.cond.cleanup

Modified: llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 ; CHECK: store i32
 ; CHECK-NOT: store i32
 ; CHECK: ret
-define void @small_loop(i32* nocapture %inArray, i32 %size) nounwind {
+define amdgpu_kernel void @small_loop(i32* nocapture %inArray, i32 %size) nounwind {
 entry:
   %0 = icmp sgt i32 %size, 0
   br i1 %0, label %loop, label %exit

Modified: llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@ target datalayout = "e-p:32:32:32-p3:16:
 
 
 ; Simple 3-pair chain with loads and stores
-define void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
+define amdgpu_kernel void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
 ; CHECK-LABEL: @test1_as_3_3_3(
 ; CHECK: load <2 x double>, <2 x double> addrspace(3)*
 ; CHECK: load <2 x double>, <2 x double> addrspace(3)*
@@ -29,7 +29,7 @@ define void @test1_as_3_3_3(double addrs
   ret void
 }
 
-define void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
+define amdgpu_kernel void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
 ; CHECK-LABEL: @test1_as_3_0_0(
 ; CHECK: load <2 x double>, <2 x double> addrspace(3)*
 ; CHECK: load <2 x double>, <2 x double>*
@@ -49,7 +49,7 @@ define void @test1_as_3_0_0(double addrs
   ret void
 }
 
-define void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
+define amdgpu_kernel void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
 ; CHECK-LABEL: @test1_as_0_0_3(
 ; CHECK: load <2 x double>, <2 x double>*
 ; CHECK: load <2 x double>, <2 x double>*

Modified: llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll (original)
+++ llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 1
 ; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 32
 ; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 33
-define void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
   %tmp = sext i32 %y to i64
   %tmp1 = sext i32 %x to i64
   %tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp1, i64 %tmp
@@ -42,7 +42,7 @@ define void @sum_of_array(i32 %x, i32 %y
 ; IR: add i32 %x, 256
 ; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
 ; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-define void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
   %tmp = sext i32 %y to i64
   %tmp1 = sext i32 %x to i64
   %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp1, i64 %tmp
@@ -74,7 +74,7 @@ define void @sum_of_array_over_max_mubuf
 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 255
 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16128
 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16383
-define void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
   %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %x, i32 %y
   %tmp4 = load float, float addrspace(3)* %tmp2, align 4
   %tmp5 = fadd float %tmp4, 0.000000e+00

Modified: llvm/trunk/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll (original)
+++ llvm/trunk/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 ; CHECK-LABEL: @slsr_after_reassociate_global_geps_mubuf_max_offset(
 ; CHECK: [[b1:%[0-9]+]] = getelementptr float, float addrspace(1)* %arr, i64 [[bump:%[0-9]+]]
 ; CHECK: [[b2:%[0-9]+]] = getelementptr float, float addrspace(1)* [[b1]], i64 [[bump]]
-define void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
 bb:
   %i2 = shl nsw i32 %i, 1
   %j1 = add nsw i32 %i, 1023
@@ -33,7 +33,7 @@ bb:
 ; CHECK: %tmp = sext i32 %j1 to i64
 ; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp
 ; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp5
-define void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
 bb:
   %i2 = shl nsw i32 %i, 1
   %j1 = add nsw i32 %i, 1024
@@ -61,7 +61,7 @@ bb:
 
 ; CHECK: [[B2:%[0-9]+]] = getelementptr float, float addrspace(3)* [[B1]], i32 %i
 ; CHECK: getelementptr inbounds float, float addrspace(3)* [[B2]], i32 16383
-define void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
 bb:
   %i2 = shl nsw i32 %i, 1
   %j1 = add nsw i32 %i, 16383
@@ -86,7 +86,7 @@ bb:
 ; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j1
 ; CHECK: %j2 = add i32 %j1, %i
 ; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j2
-define void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
 bb:
   %i2 = shl nsw i32 %i, 1
   %j1 = add nsw i32 %i, 16384

More information about the llvm-commits mailing list