[llvm] r298444 - AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel

Matt Arsenault via llvm-commits <llvm-commits at lists.llvm.org>
Tue Mar 21 14:40:01 PDT 2017


Author: arsenm
Date: Tue Mar 21 16:39:51 2017
New Revision: 298444

URL: http://llvm.org/viewvc/llvm-project?rev=298444&view=rev
Log:
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel

Currently, functions with the default C calling convention are treated
the same as compute kernels. Make this explicit so that the default
calling convention can later be changed to a non-kernel convention.

Converted with perl -pi -e 's/define void/define amdgpu_kernel void/'
on the relevant test directories (and undone in one place that actually
wanted a non-kernel function).
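
For reference, a minimal sketch of how the same substitution could be run
over one test directory; the find/xargs wrapper and the directory path are
illustrative, not the exact invocation used for this commit:

    # Rewrite every unspecified-CC kernel definition in the .ll tests
    # under one AMDGPU test directory (path chosen for illustration).
    find llvm/test/CodeGen/AMDGPU -name '*.ll' -print0 \
      | xargs -0 perl -pi -e 's/define void/define amdgpu_kernel void/'

Any test that intentionally uses a non-kernel function then needs its
define reverted by hand, as noted above.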

Modified:
    llvm/trunk/test/Analysis/CostModel/AMDGPU/add-sub.ll
    llvm/trunk/test/Analysis/CostModel/AMDGPU/bit-ops.ll
    llvm/trunk/test/Analysis/CostModel/AMDGPU/br.ll
    llvm/trunk/test/Analysis/CostModel/AMDGPU/extractelement.ll
    llvm/trunk/test/Analysis/CostModel/AMDGPU/fabs.ll
    llvm/trunk/test/Analysis/CostModel/AMDGPU/fadd.ll
    llvm/trunk/test/Analysis/CostModel/AMDGPU/fdiv.ll
    llvm/trunk/test/Analysis/CostModel/AMDGPU/fmul.ll
    llvm/trunk/test/Analysis/CostModel/AMDGPU/fsub.ll
    llvm/trunk/test/Analysis/CostModel/AMDGPU/insertelement.ll
    llvm/trunk/test/Analysis/CostModel/AMDGPU/mul.ll
    llvm/trunk/test/Analysis/CostModel/AMDGPU/shifts.ll
    llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
    llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
    llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
    llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
    llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
    llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
    llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
    llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
    llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
    llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
    llvm/trunk/test/CodeGen/AMDGPU/add-debug.ll
    llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/add.ll
    llvm/trunk/test/CodeGen/AMDGPU/add.v2i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/add_i128.ll
    llvm/trunk/test/CodeGen/AMDGPU/add_i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/addrspacecast-captured.ll
    llvm/trunk/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
    llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll
    llvm/trunk/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
    llvm/trunk/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
    llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
    llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
    llvm/trunk/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
    llvm/trunk/test/CodeGen/AMDGPU/and-gcn.ll
    llvm/trunk/test/CodeGen/AMDGPU/and.ll
    llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
    llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features.ll
    llvm/trunk/test/CodeGen/AMDGPU/anonymous-gv.ll
    llvm/trunk/test/CodeGen/AMDGPU/anyext.ll
    llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
    llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/ashr.v2i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
    llvm/trunk/test/CodeGen/AMDGPU/atomic_load_add.ll
    llvm/trunk/test/CodeGen/AMDGPU/atomic_load_sub.ll
    llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
    llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
    llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
    llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
    llvm/trunk/test/CodeGen/AMDGPU/attr-unparseable.ll
    llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll
    llvm/trunk/test/CodeGen/AMDGPU/basic-loop.ll
    llvm/trunk/test/CodeGen/AMDGPU/bfe-patterns.ll
    llvm/trunk/test/CodeGen/AMDGPU/bfe_uint.ll
    llvm/trunk/test/CodeGen/AMDGPU/bfi_int.ll
    llvm/trunk/test/CodeGen/AMDGPU/bfm.ll
    llvm/trunk/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
    llvm/trunk/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
    llvm/trunk/test/CodeGen/AMDGPU/bitreverse.ll
    llvm/trunk/test/CodeGen/AMDGPU/br_cc.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/branch-relax-spill.ll
    llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
    llvm/trunk/test/CodeGen/AMDGPU/bswap.ll
    llvm/trunk/test/CodeGen/AMDGPU/build_vector.ll
    llvm/trunk/test/CodeGen/AMDGPU/call.ll
    llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll
    llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
    llvm/trunk/test/CodeGen/AMDGPU/cf-stack-bug.ll
    llvm/trunk/test/CodeGen/AMDGPU/cf_end.ll
    llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
    llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
    llvm/trunk/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
    llvm/trunk/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
    llvm/trunk/test/CodeGen/AMDGPU/coalescer_remat.ll
    llvm/trunk/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
    llvm/trunk/test/CodeGen/AMDGPU/combine_vloads.ll
    llvm/trunk/test/CodeGen/AMDGPU/commute-compares.ll
    llvm/trunk/test/CodeGen/AMDGPU/commute_modifiers.ll
    llvm/trunk/test/CodeGen/AMDGPU/concat_vectors.ll
    llvm/trunk/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
    llvm/trunk/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
    llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
    llvm/trunk/test/CodeGen/AMDGPU/convergent-inlineasm.ll
    llvm/trunk/test/CodeGen/AMDGPU/copy-illegal-type.ll
    llvm/trunk/test/CodeGen/AMDGPU/copy-to-reg.ll
    llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll
    llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
    llvm/trunk/test/CodeGen/AMDGPU/ctpop.ll
    llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll
    llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll
    llvm/trunk/test/CodeGen/AMDGPU/cube.ll
    llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/trunk/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
    llvm/trunk/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
    llvm/trunk/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
    llvm/trunk/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
    llvm/trunk/test/CodeGen/AMDGPU/debug.ll
    llvm/trunk/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
    llvm/trunk/test/CodeGen/AMDGPU/debugger-insert-nops.ll
    llvm/trunk/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
    llvm/trunk/test/CodeGen/AMDGPU/default-fp-mode.ll
    llvm/trunk/test/CodeGen/AMDGPU/detect-dead-lanes.mir
    llvm/trunk/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
    llvm/trunk/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
    llvm/trunk/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
    llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll
    llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll
    llvm/trunk/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
    llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll
    llvm/trunk/test/CodeGen/AMDGPU/ds_read2st64.ll
    llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll
    llvm/trunk/test/CodeGen/AMDGPU/ds_write2st64.ll
    llvm/trunk/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
    llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll
    llvm/trunk/test/CodeGen/AMDGPU/early-if-convert.ll
    llvm/trunk/test/CodeGen/AMDGPU/elf.r600.ll
    llvm/trunk/test/CodeGen/AMDGPU/empty-function.ll
    llvm/trunk/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
    llvm/trunk/test/CodeGen/AMDGPU/endcf-loop-header.ll
    llvm/trunk/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
    llvm/trunk/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/extload-align.ll
    llvm/trunk/test/CodeGen/AMDGPU/extload-private.ll
    llvm/trunk/test/CodeGen/AMDGPU/extload.ll
    llvm/trunk/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
    llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
    llvm/trunk/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
    llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fabs.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fabs.ll
    llvm/trunk/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
    llvm/trunk/test/CodeGen/AMDGPU/fadd.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fadd.ll
    llvm/trunk/test/CodeGen/AMDGPU/fadd64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.ll
    llvm/trunk/test/CodeGen/AMDGPU/fceil.ll
    llvm/trunk/test/CodeGen/AMDGPU/fceil64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fcmp-cnd.ll
    llvm/trunk/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
    llvm/trunk/test/CodeGen/AMDGPU/fcmp.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fcmp.ll
    llvm/trunk/test/CodeGen/AMDGPU/fcmp64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fconst64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f32.ll
    llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fdiv.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll
    llvm/trunk/test/CodeGen/AMDGPU/ffloor.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/ffloor.ll
    llvm/trunk/test/CodeGen/AMDGPU/flat-address-space.ll
    llvm/trunk/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
    llvm/trunk/test/CodeGen/AMDGPU/flat-scratch-reg.ll
    llvm/trunk/test/CodeGen/AMDGPU/flat_atomics.ll
    llvm/trunk/test/CodeGen/AMDGPU/flat_atomics_i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fma-combine.ll
    llvm/trunk/test/CodeGen/AMDGPU/fma.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fma.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmax3.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmax3.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmin3.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll
    llvm/trunk/test/CodeGen/AMDGPU/fminnum.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fminnum.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmul.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmul.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmul64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f32.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fnearbyint.ll
    llvm/trunk/test/CodeGen/AMDGPU/fneg-combines.ll
    llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.ll
    llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fneg.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fneg.ll
    llvm/trunk/test/CodeGen/AMDGPU/fp-classify.ll
    llvm/trunk/test/CodeGen/AMDGPU/fp16_to_fp32.ll
    llvm/trunk/test/CodeGen/AMDGPU/fp16_to_fp64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fp32_to_fp16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fp_to_sint.ll
    llvm/trunk/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fp_to_uint.ll
    llvm/trunk/test/CodeGen/AMDGPU/fpext.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fpext.ll
    llvm/trunk/test/CodeGen/AMDGPU/fptosi.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fptoui.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fptrunc.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fptrunc.ll
    llvm/trunk/test/CodeGen/AMDGPU/fract.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fract.ll
    llvm/trunk/test/CodeGen/AMDGPU/frem.ll
    llvm/trunk/test/CodeGen/AMDGPU/fsqrt.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/fsqrt.ll
    llvm/trunk/test/CodeGen/AMDGPU/fsub.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/fsub.ll
    llvm/trunk/test/CodeGen/AMDGPU/fsub64.ll
    llvm/trunk/test/CodeGen/AMDGPU/ftrunc.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/ftrunc.ll
    llvm/trunk/test/CodeGen/AMDGPU/gep-address-space.ll
    llvm/trunk/test/CodeGen/AMDGPU/global-constant.ll
    llvm/trunk/test/CodeGen/AMDGPU/global-directive.ll
    llvm/trunk/test/CodeGen/AMDGPU/global-extload-i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/global-variable-relocs.ll
    llvm/trunk/test/CodeGen/AMDGPU/global_atomics.ll
    llvm/trunk/test/CodeGen/AMDGPU/global_atomics_i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/gv-const-addrspace.ll
    llvm/trunk/test/CodeGen/AMDGPU/gv-offset-folding.ll
    llvm/trunk/test/CodeGen/AMDGPU/half.ll
    llvm/trunk/test/CodeGen/AMDGPU/hsa-default-device.ll
    llvm/trunk/test/CodeGen/AMDGPU/hsa-fp-mode.ll
    llvm/trunk/test/CodeGen/AMDGPU/hsa-globals.ll
    llvm/trunk/test/CodeGen/AMDGPU/hsa-group-segment.ll
    llvm/trunk/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
    llvm/trunk/test/CodeGen/AMDGPU/i1-copy-phi.ll
    llvm/trunk/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
    llvm/trunk/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
    llvm/trunk/test/CodeGen/AMDGPU/icmp.i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/icmp64.ll
    llvm/trunk/test/CodeGen/AMDGPU/image-attributes.ll
    llvm/trunk/test/CodeGen/AMDGPU/image-resource-id.ll
    llvm/trunk/test/CodeGen/AMDGPU/imm.ll
    llvm/trunk/test/CodeGen/AMDGPU/imm16.ll
    llvm/trunk/test/CodeGen/AMDGPU/immv216.ll
    llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
    llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll
    llvm/trunk/test/CodeGen/AMDGPU/indirect-private-64.ll
    llvm/trunk/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
    llvm/trunk/test/CodeGen/AMDGPU/infinite-loop.ll
    llvm/trunk/test/CodeGen/AMDGPU/inline-asm.ll
    llvm/trunk/test/CodeGen/AMDGPU/inline-calls.ll
    llvm/trunk/test/CodeGen/AMDGPU/inline-constraints.ll
    llvm/trunk/test/CodeGen/AMDGPU/inlineasm-16.ll
    llvm/trunk/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
    llvm/trunk/test/CodeGen/AMDGPU/inlineasm-packed.ll
    llvm/trunk/test/CodeGen/AMDGPU/insert_subreg.ll
    llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir
    llvm/trunk/test/CodeGen/AMDGPU/internalize.ll
    llvm/trunk/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
    llvm/trunk/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
    llvm/trunk/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
    llvm/trunk/test/CodeGen/AMDGPU/kcache-fold.ll
    llvm/trunk/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
    llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll
    llvm/trunk/test/CodeGen/AMDGPU/large-alloca-compute.ll
    llvm/trunk/test/CodeGen/AMDGPU/large-constant-initializer.ll
    llvm/trunk/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
    llvm/trunk/test/CodeGen/AMDGPU/lds-alignment.ll
    llvm/trunk/test/CodeGen/AMDGPU/lds-initializer.ll
    llvm/trunk/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
    llvm/trunk/test/CodeGen/AMDGPU/lds-oqap-crash.ll
    llvm/trunk/test/CodeGen/AMDGPU/lds-output-queue.ll
    llvm/trunk/test/CodeGen/AMDGPU/lds-size.ll
    llvm/trunk/test/CodeGen/AMDGPU/lds-zero-initializer.ll
    llvm/trunk/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
    llvm/trunk/test/CodeGen/AMDGPU/literals.ll
    llvm/trunk/test/CodeGen/AMDGPU/liveness.mir
    llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.SI.export.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.cos.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.cos.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.dbg.value.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.exp2.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.floor.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.fma.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.log2.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.log2.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.memcpy.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.r600.tex.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.rint.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.rint.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.rint.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.round.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.round.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.sin.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.sin.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-constant-f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-constant-i1.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-constant-i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-constant-i32.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-constant-i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-constant-i8.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-global-f32.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-global-f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-global-i1.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-global-i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-global-i32.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-global-i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-global-i8.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-local-f32.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-local-f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-local-i1.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-local-i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-local-i32.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-local-i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-local-i8.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-weird-sizes.ll
    llvm/trunk/test/CodeGen/AMDGPU/local-64.ll
    llvm/trunk/test/CodeGen/AMDGPU/local-atomics.ll
    llvm/trunk/test/CodeGen/AMDGPU/local-atomics64.ll
    llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
    llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll
    llvm/trunk/test/CodeGen/AMDGPU/local-memory.r600.ll
    llvm/trunk/test/CodeGen/AMDGPU/loop-address.ll
    llvm/trunk/test/CodeGen/AMDGPU/loop-idiom.ll
    llvm/trunk/test/CodeGen/AMDGPU/loop_break.ll
    llvm/trunk/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
    llvm/trunk/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
    llvm/trunk/test/CodeGen/AMDGPU/lshr.v2i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/mad-combine.ll
    llvm/trunk/test/CodeGen/AMDGPU/mad24-get-global-id.ll
    llvm/trunk/test/CodeGen/AMDGPU/mad_int24.ll
    llvm/trunk/test/CodeGen/AMDGPU/mad_uint24.ll
    llvm/trunk/test/CodeGen/AMDGPU/madak.ll
    llvm/trunk/test/CodeGen/AMDGPU/madmk.ll
    llvm/trunk/test/CodeGen/AMDGPU/max.i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/max.ll
    llvm/trunk/test/CodeGen/AMDGPU/max3.ll
    llvm/trunk/test/CodeGen/AMDGPU/mem-builtins.ll
    llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll
    llvm/trunk/test/CodeGen/AMDGPU/min.ll
    llvm/trunk/test/CodeGen/AMDGPU/min3.ll
    llvm/trunk/test/CodeGen/AMDGPU/missing-store.ll
    llvm/trunk/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
    llvm/trunk/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
    llvm/trunk/test/CodeGen/AMDGPU/mubuf.ll
    llvm/trunk/test/CodeGen/AMDGPU/mul.ll
    llvm/trunk/test/CodeGen/AMDGPU/mul_int24.ll
    llvm/trunk/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
    llvm/trunk/test/CodeGen/AMDGPU/mul_uint24-r600.ll
    llvm/trunk/test/CodeGen/AMDGPU/multilevel-break.ll
    llvm/trunk/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
    llvm/trunk/test/CodeGen/AMDGPU/no-shrink-extloads.ll
    llvm/trunk/test/CodeGen/AMDGPU/opencl-image-metadata.ll
    llvm/trunk/test/CodeGen/AMDGPU/operand-folding.ll
    llvm/trunk/test/CodeGen/AMDGPU/operand-spacing.ll
    llvm/trunk/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
    llvm/trunk/test/CodeGen/AMDGPU/or.ll
    llvm/trunk/test/CodeGen/AMDGPU/over-max-lds-size.ll
    llvm/trunk/test/CodeGen/AMDGPU/pack.v2f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/pack.v2i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/packetizer.ll
    llvm/trunk/test/CodeGen/AMDGPU/parallelandifcollapse.ll
    llvm/trunk/test/CodeGen/AMDGPU/parallelorifcollapse.ll
    llvm/trunk/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
    llvm/trunk/test/CodeGen/AMDGPU/predicates.ll
    llvm/trunk/test/CodeGen/AMDGPU/private-access-no-objects.ll
    llvm/trunk/test/CodeGen/AMDGPU/private-element-size.ll
    llvm/trunk/test/CodeGen/AMDGPU/private-memory-atomics.ll
    llvm/trunk/test/CodeGen/AMDGPU/private-memory-broken.ll
    llvm/trunk/test/CodeGen/AMDGPU/private-memory-r600.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
    llvm/trunk/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll
    llvm/trunk/test/CodeGen/AMDGPU/r600.alu-limits.ll
    llvm/trunk/test/CodeGen/AMDGPU/r600.bitcast.ll
    llvm/trunk/test/CodeGen/AMDGPU/r600.global_atomics.ll
    llvm/trunk/test/CodeGen/AMDGPU/r600.private-memory.ll
    llvm/trunk/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
    llvm/trunk/test/CodeGen/AMDGPU/rcp-pattern.ll
    llvm/trunk/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
    llvm/trunk/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
    llvm/trunk/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/read_register.ll
    llvm/trunk/test/CodeGen/AMDGPU/readcyclecounter.ll
    llvm/trunk/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
    llvm/trunk/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
    llvm/trunk/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
    llvm/trunk/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
    llvm/trunk/test/CodeGen/AMDGPU/register-count-comments.ll
    llvm/trunk/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
    llvm/trunk/test/CodeGen/AMDGPU/rename-independent-subregs.mir
    llvm/trunk/test/CodeGen/AMDGPU/reorder-stores.ll
    llvm/trunk/test/CodeGen/AMDGPU/rotl.i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/rotl.ll
    llvm/trunk/test/CodeGen/AMDGPU/rotr.i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/rotr.ll
    llvm/trunk/test/CodeGen/AMDGPU/rsq.ll
    llvm/trunk/test/CodeGen/AMDGPU/s_addk_i32.ll
    llvm/trunk/test/CodeGen/AMDGPU/s_movk_i32.ll
    llvm/trunk/test/CodeGen/AMDGPU/s_mulk_i32.ll
    llvm/trunk/test/CodeGen/AMDGPU/sad.ll
    llvm/trunk/test/CodeGen/AMDGPU/saddo.ll
    llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll
    llvm/trunk/test/CodeGen/AMDGPU/sampler-resource-id.ll
    llvm/trunk/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir
    llvm/trunk/test/CodeGen/AMDGPU/scalar_to_vector.ll
    llvm/trunk/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
    llvm/trunk/test/CodeGen/AMDGPU/schedule-global-loads.ll
    llvm/trunk/test/CodeGen/AMDGPU/schedule-if-2.ll
    llvm/trunk/test/CodeGen/AMDGPU/schedule-if.ll
    llvm/trunk/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
    llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
    llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
    llvm/trunk/test/CodeGen/AMDGPU/scratch-buffer.ll
    llvm/trunk/test/CodeGen/AMDGPU/sdiv.ll
    llvm/trunk/test/CodeGen/AMDGPU/sdivrem24.ll
    llvm/trunk/test/CodeGen/AMDGPU/sdivrem64.ll
    llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll
    llvm/trunk/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
    llvm/trunk/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
    llvm/trunk/test/CodeGen/AMDGPU/select-i1.ll
    llvm/trunk/test/CodeGen/AMDGPU/select-opt.ll
    llvm/trunk/test/CodeGen/AMDGPU/select-vectors.ll
    llvm/trunk/test/CodeGen/AMDGPU/select.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/select.ll
    llvm/trunk/test/CodeGen/AMDGPU/select64.ll
    llvm/trunk/test/CodeGen/AMDGPU/selectcc-cnd.ll
    llvm/trunk/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
    llvm/trunk/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
    llvm/trunk/test/CodeGen/AMDGPU/selectcc-opt.ll
    llvm/trunk/test/CodeGen/AMDGPU/selectcc.ll
    llvm/trunk/test/CodeGen/AMDGPU/set-dx10.ll
    llvm/trunk/test/CodeGen/AMDGPU/setcc-equivalent.ll
    llvm/trunk/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
    llvm/trunk/test/CodeGen/AMDGPU/setcc-opt.ll
    llvm/trunk/test/CodeGen/AMDGPU/setcc.ll
    llvm/trunk/test/CodeGen/AMDGPU/setcc64.ll
    llvm/trunk/test/CodeGen/AMDGPU/sext-eliminate.ll
    llvm/trunk/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
    llvm/trunk/test/CodeGen/AMDGPU/sext-in-reg.ll
    llvm/trunk/test/CodeGen/AMDGPU/sgpr-control-flow.ll
    llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
    llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll
    llvm/trunk/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
    llvm/trunk/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
    llvm/trunk/test/CodeGen/AMDGPU/shift-i64-opts.ll
    llvm/trunk/test/CodeGen/AMDGPU/shl.ll
    llvm/trunk/test/CodeGen/AMDGPU/shl.v2i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/shl_add_constant.ll
    llvm/trunk/test/CodeGen/AMDGPU/shl_add_ptr.ll
    llvm/trunk/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
    llvm/trunk/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
    llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
    llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
    llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf.ll
    llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
    llvm/trunk/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
    llvm/trunk/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
    llvm/trunk/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
    llvm/trunk/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
    llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
    llvm/trunk/test/CodeGen/AMDGPU/si-vector-hang.ll
    llvm/trunk/test/CodeGen/AMDGPU/sign_extend.ll
    llvm/trunk/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/sint_to_fp.ll
    llvm/trunk/test/CodeGen/AMDGPU/sitofp.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/smed3.ll
    llvm/trunk/test/CodeGen/AMDGPU/sminmax.ll
    llvm/trunk/test/CodeGen/AMDGPU/sminmax.v2i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
    llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
    llvm/trunk/test/CodeGen/AMDGPU/sopk-compares.ll
    llvm/trunk/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
    llvm/trunk/test/CodeGen/AMDGPU/spill-cfg-position.ll
    llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
    llvm/trunk/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
    llvm/trunk/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
    llvm/trunk/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
    llvm/trunk/test/CodeGen/AMDGPU/splitkit.mir
    llvm/trunk/test/CodeGen/AMDGPU/sra.ll
    llvm/trunk/test/CodeGen/AMDGPU/srem.ll
    llvm/trunk/test/CodeGen/AMDGPU/srl.ll
    llvm/trunk/test/CodeGen/AMDGPU/ssubo.ll
    llvm/trunk/test/CodeGen/AMDGPU/store-barrier.ll
    llvm/trunk/test/CodeGen/AMDGPU/store-global.ll
    llvm/trunk/test/CodeGen/AMDGPU/store-local.ll
    llvm/trunk/test/CodeGen/AMDGPU/store-private.ll
    llvm/trunk/test/CodeGen/AMDGPU/store-v3i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/store-vector-ptrs.ll
    llvm/trunk/test/CodeGen/AMDGPU/store_typed.ll
    llvm/trunk/test/CodeGen/AMDGPU/structurize.ll
    llvm/trunk/test/CodeGen/AMDGPU/structurize1.ll
    llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/sub.ll
    llvm/trunk/test/CodeGen/AMDGPU/sub.v2i16.ll
    llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
    llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
    llvm/trunk/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
    llvm/trunk/test/CodeGen/AMDGPU/subreg-intervals.mir
    llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll
    llvm/trunk/test/CodeGen/AMDGPU/trap.ll
    llvm/trunk/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
    llvm/trunk/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
    llvm/trunk/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/trunc-store-i1.ll
    llvm/trunk/test/CodeGen/AMDGPU/trunc-store.ll
    llvm/trunk/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
    llvm/trunk/test/CodeGen/AMDGPU/trunc.ll
    llvm/trunk/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
    llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll
    llvm/trunk/test/CodeGen/AMDGPU/udiv.ll
    llvm/trunk/test/CodeGen/AMDGPU/udivrem.ll
    llvm/trunk/test/CodeGen/AMDGPU/udivrem24.ll
    llvm/trunk/test/CodeGen/AMDGPU/udivrem64.ll
    llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
    llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
    llvm/trunk/test/CodeGen/AMDGPU/uint_to_fp.ll
    llvm/trunk/test/CodeGen/AMDGPU/uitofp.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/umed3.ll
    llvm/trunk/test/CodeGen/AMDGPU/unaligned-load-store.ll
    llvm/trunk/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
    llvm/trunk/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
    llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll
    llvm/trunk/test/CodeGen/AMDGPU/uniform-crash.ll
    llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
    llvm/trunk/test/CodeGen/AMDGPU/unknown-processor.ll
    llvm/trunk/test/CodeGen/AMDGPU/unroll.ll
    llvm/trunk/test/CodeGen/AMDGPU/unsupported-cc.ll
    llvm/trunk/test/CodeGen/AMDGPU/urem.ll
    llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
    llvm/trunk/test/CodeGen/AMDGPU/usubo.ll
    llvm/trunk/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
    llvm/trunk/test/CodeGen/AMDGPU/v_cndmask.ll
    llvm/trunk/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
    llvm/trunk/test/CodeGen/AMDGPU/v_mac.ll
    llvm/trunk/test/CodeGen/AMDGPU/v_mac_f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/v_madak_f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
    llvm/trunk/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
    llvm/trunk/test/CodeGen/AMDGPU/vector-alloca.ll
    llvm/trunk/test/CodeGen/AMDGPU/vector-extract-insert.ll
    llvm/trunk/test/CodeGen/AMDGPU/vectorize-global-local.ll
    llvm/trunk/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
    llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
    llvm/trunk/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
    llvm/trunk/test/CodeGen/AMDGPU/vop-shrink.ll
    llvm/trunk/test/CodeGen/AMDGPU/vselect.ll
    llvm/trunk/test/CodeGen/AMDGPU/vselect64.ll
    llvm/trunk/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
    llvm/trunk/test/CodeGen/AMDGPU/vtx-schedule.ll
    llvm/trunk/test/CodeGen/AMDGPU/waitcnt-flat.ll
    llvm/trunk/test/CodeGen/AMDGPU/waitcnt.mir
    llvm/trunk/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
    llvm/trunk/test/CodeGen/AMDGPU/write_register.ll
    llvm/trunk/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
    llvm/trunk/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll
    llvm/trunk/test/CodeGen/AMDGPU/xor.ll
    llvm/trunk/test/CodeGen/AMDGPU/zero_extend.ll
    llvm/trunk/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
    llvm/trunk/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir
    llvm/trunk/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
    llvm/trunk/test/CodeGen/MIR/AMDGPU/intrinsics.mir
    llvm/trunk/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir
    llvm/trunk/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
    llvm/trunk/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll
    llvm/trunk/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll
    llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
    llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
    llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
    llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll
    llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/select.ll
    llvm/trunk/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
    llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
    llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
    llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
    llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
    llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
    llvm/trunk/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
    llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
    llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
    llvm/trunk/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
    llvm/trunk/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
    llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
    llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
    llvm/trunk/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll

Modified: llvm/trunk/test/Analysis/CostModel/AMDGPU/add-sub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AMDGPU/add-sub.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/add-sub.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/add-sub.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 
 ; CHECK: 'add_i32'
 ; CHECK: estimated cost of 1 for {{.*}} add i32
-define void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %add = add i32 %vec, %b
   store i32 %add, i32 addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @add_i32(i32 addrspace(1)* %
 
 ; CHECK: 'add_v2i32'
 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i32>
-define void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
+define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %add = add <2 x i32> %vec, %b
   store <2 x i32> %add, <2 x i32> addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @add_v2i32(<2 x i32> addrspa
 
 ; CHECK: 'add_v3i32'
 ; CHECK: estimated cost of 3 for {{.*}} add <3 x i32>
-define void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
+define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %add = add <3 x i32> %vec, %b
   store <3 x i32> %add, <3 x i32> addrspace(1)* %out
@@ -30,7 +30,7 @@ define void @add_v3i32(<3 x i32> addrspa
 
 ; CHECK: 'add_v4i32'
 ; CHECK: estimated cost of 4 for {{.*}} add <4 x i32>
-define void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
+define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %add = add <4 x i32> %vec, %b
   store <4 x i32> %add, <4 x i32> addrspace(1)* %out
@@ -39,7 +39,7 @@ define void @add_v4i32(<4 x i32> addrspa
 
 ; CHECK: 'add_i64'
 ; CHECK: estimated cost of 2 for {{.*}} add i64
-define void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %add = add i64 %vec, %b
   store i64 %add, i64 addrspace(1)* %out
@@ -48,7 +48,7 @@ define void @add_i64(i64 addrspace(1)* %
 
 ; CHECK: 'add_v2i64'
 ; CHECK: estimated cost of 4 for {{.*}} add <2 x i64>
-define void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
+define amdgpu_kernel void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %add = add <2 x i64> %vec, %b
   store <2 x i64> %add, <2 x i64> addrspace(1)* %out
@@ -57,7 +57,7 @@ define void @add_v2i64(<2 x i64> addrspa
 
 ; CHECK: 'add_v3i64'
 ; CHECK: estimated cost of 6 for {{.*}} add <3 x i64>
-define void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
+define amdgpu_kernel void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %add = add <3 x i64> %vec, %b
   store <3 x i64> %add, <3 x i64> addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @add_v3i64(<3 x i64> addrspa
 
 ; CHECK: 'add_v4i64'
 ; CHECK: estimated cost of 8 for {{.*}} add <4 x i64>
-define void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
+define amdgpu_kernel void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %add = add <4 x i64> %vec, %b
   store <4 x i64> %add, <4 x i64> addrspace(1)* %out
@@ -75,7 +75,7 @@ define void @add_v4i64(<4 x i64> addrspa
 
 ; CHECK: 'add_v16i64'
 ; CHECK: estimated cost of 32 for {{.*}} add <16 x i64>
-define void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
+define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
   %vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr
   %add = add <16 x i64> %vec, %b
   store <16 x i64> %add, <16 x i64> addrspace(1)* %out
@@ -84,7 +84,7 @@ define void @add_v16i64(<16 x i64> addrs
 
 ; CHECK: 'add_i16'
 ; CHECK: estimated cost of 1 for {{.*}} add i16
-define void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
+define amdgpu_kernel void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
   %add = add i16 %vec, %b
   store i16 %add, i16 addrspace(1)* %out
@@ -93,7 +93,7 @@ define void @add_i16(i16 addrspace(1)* %
 
 ; CHECK: 'add_v2i16'
 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i16>
-define void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
+define amdgpu_kernel void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %add = add <2 x i16> %vec, %b
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@@ -102,7 +102,7 @@ define void @add_v2i16(<2 x i16> addrspa
 
 ; CHECK: 'sub_i32'
 ; CHECK: estimated cost of 1 for {{.*}} sub i32
-define void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %sub = sub i32 %vec, %b
   store i32 %sub, i32 addrspace(1)* %out
@@ -111,7 +111,7 @@ define void @sub_i32(i32 addrspace(1)* %
 
 ; CHECK: 'sub_i64'
 ; CHECK: estimated cost of 2 for {{.*}} sub i64
-define void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %sub = sub i64 %vec, %b
   store i64 %sub, i64 addrspace(1)* %out
@@ -119,7 +119,7 @@ define void @sub_i64(i64 addrspace(1)* %
 }
 ; CHECK: 'sub_i16'
 ; CHECK: estimated cost of 1 for {{.*}} sub i16
-define void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
+define amdgpu_kernel void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
   %sub = sub i16 %vec, %b
   store i16 %sub, i16 addrspace(1)* %out
@@ -128,7 +128,7 @@ define void @sub_i16(i16 addrspace(1)* %
 
 ; CHECK: 'sub_v2i16'
 ; CHECK: estimated cost of 2 for {{.*}} sub <2 x i16>
-define void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
+define amdgpu_kernel void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %sub = sub <2 x i16> %vec, %b
   store <2 x i16> %sub, <2 x i16> addrspace(1)* %out

Modified: llvm/trunk/test/Analysis/CostModel/AMDGPU/bit-ops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AMDGPU/bit-ops.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/bit-ops.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/bit-ops.ll Tue Mar 21 16:39:51 2017
@@ -2,7 +2,7 @@
 
 ; CHECK: 'or_i32'
 ; CHECK: estimated cost of 1 for {{.*}} or i32
-define void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = or i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @or_i32(i32 addrspace(1)* %o
 
 ; CHECK: 'or_i64'
 ; CHECK: estimated cost of 2 for {{.*}} or i64
-define void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = or i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @or_i64(i64 addrspace(1)* %o
 
 ; CHECK: 'xor_i32'
 ; CHECK: estimated cost of 1 for {{.*}} xor i32
-define void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = xor i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @xor_i32(i32 addrspace(1)* %
 
 ; CHECK: 'xor_i64'
 ; CHECK: estimated cost of 2 for {{.*}} xor i64
-define void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = xor i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -39,7 +39,7 @@ define void @xor_i64(i64 addrspace(1)* %
 
 ; CHECK: 'and_i32'
 ; CHECK: estimated cost of 1 for {{.*}} and i32
-define void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = and i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -48,7 +48,7 @@ define void @and_i32(i32 addrspace(1)* %
 
 ; CHECK: 'and_i64'
 ; CHECK: estimated cost of 2 for {{.*}} and i64
-define void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = and i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out

Modified: llvm/trunk/test/Analysis/CostModel/AMDGPU/br.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AMDGPU/br.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/br.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/br.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 ; CHECK: estimated cost of 10 for instruction: br i1
 ; CHECK: estimated cost of 10 for instruction: br label
 ; CHECK: estimated cost of 10 for instruction: ret void
-define void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 bb0:
   br i1 undef, label %bb1, label %bb2
 
@@ -21,7 +21,7 @@ bb2:
 
 ; CHECK: 'test_switch_cost'
 ; CHECK: Unknown cost for instruction:   switch
-define void @test_switch_cost(i32 %a) #0 {
+define amdgpu_kernel void @test_switch_cost(i32 %a) #0 {
 entry:
   switch i32 %a, label %default [
     i32 0, label %case0

Modified: llvm/trunk/test/Analysis/CostModel/AMDGPU/extractelement.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AMDGPU/extractelement.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/extractelement.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/extractelement.ll Tue Mar 21 16:39:51 2017
@@ -2,7 +2,7 @@
 
 ; CHECK: 'extractelement_v2i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32>
-define void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %elt = extractelement <2 x i32> %vec, i32 1
   store i32 %elt, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @extractelement_v2i32(i32 ad
 
 ; CHECK: 'extractelement_v2f32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float>
-define void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %elt = extractelement <2 x float> %vec, i32 1
   store float %elt, float addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @extractelement_v2f32(float
 
 ; CHECK: 'extractelement_v3i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32>
-define void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %elt = extractelement <3 x i32> %vec, i32 1
   store i32 %elt, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @extractelement_v3i32(i32 ad
 
 ; CHECK: 'extractelement_v4i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32>
-define void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %elt = extractelement <4 x i32> %vec, i32 1
   store i32 %elt, i32 addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @extractelement_v4i32(i32 ad
 
 ; CHECK: 'extractelement_v8i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32>
-define void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
   %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
   %elt = extractelement <8 x i32> %vec, i32 1
   store i32 %elt, i32 addrspace(1)* %out
@@ -48,7 +48,7 @@ define void @extractelement_v8i32(i32 ad
 ; FIXME: Should be non-0
 ; CHECK: 'extractelement_v8i32_dynindex'
 ; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32>
-define void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
+define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
   %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
   %elt = extractelement <8 x i32> %vec, i32 %idx
   store i32 %elt, i32 addrspace(1)* %out
@@ -57,7 +57,7 @@ define void @extractelement_v8i32_dynind
 
 ; CHECK: 'extractelement_v2i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64>
-define void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %elt = extractelement <2 x i64> %vec, i64 1
   store i64 %elt, i64 addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @extractelement_v2i64(i64 ad
 
 ; CHECK: 'extractelement_v3i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64>
-define void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %elt = extractelement <3 x i64> %vec, i64 1
   store i64 %elt, i64 addrspace(1)* %out
@@ -75,7 +75,7 @@ define void @extractelement_v3i64(i64 ad
 
 ; CHECK: 'extractelement_v4i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64>
-define void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %elt = extractelement <4 x i64> %vec, i64 1
   store i64 %elt, i64 addrspace(1)* %out
@@ -84,7 +84,7 @@ define void @extractelement_v4i64(i64 ad
 
 ; CHECK: 'extractelement_v8i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64>
-define void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %elt = extractelement <8 x i64> %vec, i64 1
   store i64 %elt, i64 addrspace(1)* %out
@@ -93,7 +93,7 @@ define void @extractelement_v8i64(i64 ad
 
 ; CHECK: 'extractelement_v4i8'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8>
-define void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
   %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr
   %elt = extractelement <4 x i8> %vec, i8 1
   store i8 %elt, i8 addrspace(1)* %out
@@ -102,7 +102,7 @@ define void @extractelement_v4i8(i8 addr
 
 ; CHECK: 'extractelement_v2i16'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16>
-define void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %elt = extractelement <2 x i16> %vec, i16 1
   store i16 %elt, i16 addrspace(1)* %out

Modified: llvm/trunk/test/Analysis/CostModel/AMDGPU/fabs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AMDGPU/fabs.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/fabs.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/fabs.ll Tue Mar 21 16:39:51 2017
@@ -2,7 +2,7 @@
 
 ; CHECK: 'fabs_f32'
 ; CHECK: estimated cost of 0 for {{.*}} call float @llvm.fabs.f32
-define void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %fabs = call float @llvm.fabs.f32(float %vec) #1
   store float %fabs, float addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @fabs_f32(float addrspace(1)
 
 ; CHECK: 'fabs_v2f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x float> @llvm.fabs.v2f32
-define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %vec) #1
   store <2 x float> %fabs, <2 x float> addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @fabs_v2f32(<2 x float> addr
 
 ; CHECK: 'fabs_v3f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x float> @llvm.fabs.v3f32
-define void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %vec) #1
   store <3 x float> %fabs, <3 x float> addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @fabs_v3f32(<3 x float> addr
 
 ; CHECK: 'fabs_f64'
 ; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64
-define void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %fabs = call double @llvm.fabs.f64(double %vec) #1
   store double %fabs, double addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @fabs_f64(double addrspace(1
 
 ; CHECK: 'fabs_v2f64'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x double> @llvm.fabs.v2f64
-define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %vec) #1
   store <2 x double> %fabs, <2 x double> addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @fabs_v2f64(<2 x double> add
 
 ; CHECK: 'fabs_v3f64'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x double> @llvm.fabs.v3f64
-define void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %fabs = call <3 x double> @llvm.fabs.v3f64(<3 x double> %vec) #1
   store <3 x double> %fabs, <3 x double> addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @fabs_v3f64(<3 x double> add
 
 ; CHECK: 'fabs_f16'
 ; CHECK: estimated cost of 0 for {{.*}} call half @llvm.fabs.f16
-define void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %fabs = call half @llvm.fabs.f16(half %vec) #1
   store half %fabs, half addrspace(1)* %out
@@ -65,7 +65,7 @@ define void @fabs_f16(half addrspace(1)*
 
 ; CHECK: 'fabs_v2f16'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x half> @llvm.fabs.v2f16
-define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vec) #1
   store <2 x half> %fabs, <2 x half> addrspace(1)* %out
@@ -74,7 +74,7 @@ define void @fabs_v2f16(<2 x half> addrs
 
 ; CHECK: 'fabs_v3f16'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x half> @llvm.fabs.v3f16
-define void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
   %fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %vec) #1
   store <3 x half> %fabs, <3 x half> addrspace(1)* %out

Modified: llvm/trunk/test/Analysis/CostModel/AMDGPU/fadd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AMDGPU/fadd.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/fadd.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/fadd.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 
 ; ALL: 'fadd_f32'
 ; ALL: estimated cost of 1 for {{.*}} fadd float
-define void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fadd float %vec, %b
   store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @fadd_f32(float addrspace(1)
 
 ; ALL: 'fadd_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fadd <2 x float>
-define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fadd <2 x float> %vec, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @fadd_v2f32(<2 x float> addr
 
 ; ALL: 'fadd_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
-define void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fadd <3 x float> %vec, %b
   store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@ define void @fadd_v3f32(<3 x float> addr
 ; ALL: 'fadd_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fadd double
 ; SLOWF64: estimated cost of 3 for {{.*}} fadd double
-define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fadd double %vec, %b
   store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@ define void @fadd_f64(double addrspace(1
 ; ALL: 'fadd_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fadd <2 x double>
-define void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fadd <2 x double> %vec, %b
   store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @fadd_v2f64(<2 x double> add
 ; ALL: 'fadd_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fadd <3 x double>
-define void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fadd <3 x double> %vec, %b
   store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@ define void @fadd_v3f64(<3 x double> add
 
 ; ALL 'fadd_f16'
 ; ALL estimated cost of 1 for {{.*}} fadd half
-define void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fadd half %vec, %b
   store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @fadd_f16(half addrspace(1)*
 
 ; ALL 'fadd_v2f16'
 ; ALL estimated cost of 2 for {{.*}} fadd <2 x half>
-define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fadd <2 x half> %vec, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fadd_v2f16(<2 x half> addrs
 
 ; ALL 'fadd_v4f16'
 ; ALL estimated cost of 4 for {{.*}} fadd <4 x half>
-define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fadd <4 x half> %vec, %b
   store <4 x half> %add, <4 x half> addrspace(1)* %out

Modified: llvm/trunk/test/Analysis/CostModel/AMDGPU/fdiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AMDGPU/fdiv.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/fdiv.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/fdiv.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 
 ; CHECK: 'fdiv_f32'
 ; ALL: estimated cost of 10 for {{.*}} fdiv float
-define void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fdiv float %vec, %b
   store float %add, float addrspace(1)* %out
@@ -14,7 +14,7 @@ define void @fdiv_f32(float addrspace(1)
 
 ; ALL: 'fdiv_v2f32'
 ; ALL: estimated cost of 20 for {{.*}} fdiv <2 x float>
-define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fdiv <2 x float> %vec, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -23,7 +23,7 @@ define void @fdiv_v2f32(<2 x float> addr
 
 ; ALL: 'fdiv_v3f32'
 ; ALL: estimated cost of 30 for {{.*}} fdiv <3 x float>
-define void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fdiv <3 x float> %vec, %b
   store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -35,7 +35,7 @@ define void @fdiv_v3f32(<3 x float> addr
 ; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
 ; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
 ; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
-define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fdiv double %vec, %b
   store double %add, double addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @fdiv_f64(double addrspace(1
 ; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double>
 ; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double>
 ; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double>
-define void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fdiv <2 x double> %vec, %b
   store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -59,7 +59,7 @@ define void @fdiv_v2f64(<2 x double> add
 ; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double>
 ; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double>
 ; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double>
-define void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fdiv <3 x double> %vec, %b
   store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -68,7 +68,7 @@ define void @fdiv_v3f64(<3 x double> add
 
 ; ALL: 'fdiv_f16'
 ; ALL: estimated cost of 10 for {{.*}} fdiv half
-define void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fdiv half %vec, %b
   store half %add, half addrspace(1)* %out
@@ -77,7 +77,7 @@ define void @fdiv_f16(half addrspace(1)*
 
 ; ALL: 'fdiv_v2f16'
 ; ALL: estimated cost of 20 for {{.*}} fdiv <2 x half>
-define void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fdiv <2 x half> %vec, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -86,7 +86,7 @@ define void @fdiv_v2f16(<2 x half> addrs
 
 ; ALL: 'fdiv_v4f16'
 ; ALL: estimated cost of 40 for {{.*}} fdiv <4 x half>
-define void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fdiv <4 x half> %vec, %b
   store <4 x half> %add, <4 x half> addrspace(1)* %out

Modified: llvm/trunk/test/Analysis/CostModel/AMDGPU/fmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AMDGPU/fmul.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/fmul.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/fmul.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 
 ; ALL: 'fmul_f32'
 ; ALL: estimated cost of 1 for {{.*}} fmul float
-define void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fmul float %vec, %b
   store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @fmul_f32(float addrspace(1)
 
 ; ALL: 'fmul_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fmul <2 x float>
-define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fmul <2 x float> %vec, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @fmul_v2f32(<2 x float> addr
 
 ; ALL: 'fmul_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
-define void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fmul <3 x float> %vec, %b
   store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@ define void @fmul_v3f32(<3 x float> addr
 ; ALL: 'fmul_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fmul double
 ; SLOWF64: estimated cost of 3 for {{.*}} fmul double
-define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fmul double %vec, %b
   store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@ define void @fmul_f64(double addrspace(1
 ; ALL: 'fmul_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fmul <2 x double>
-define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fmul <2 x double> %vec, %b
   store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @fmul_v2f64(<2 x double> add
 ; ALL: 'fmul_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fmul <3 x double>
-define void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fmul <3 x double> %vec, %b
   store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@ define void @fmul_v3f64(<3 x double> add
 
 ; ALL 'fmul_f16'
 ; ALL estimated cost of 1 for {{.*}} fmul half
-define void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fmul half %vec, %b
   store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @fmul_f16(half addrspace(1)*
 
 ; ALL 'fmul_v2f16'
 ; ALL estimated cost of 2 for {{.*}} fmul <2 x half>
-define void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fmul <2 x half> %vec, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fmul_v2f16(<2 x half> addrs
 
 ; ALL 'fmul_v4f16'
 ; ALL estimated cost of 4 for {{.*}} fmul <4 x half>
-define void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fmul <4 x half> %vec, %b
   store <4 x half> %add, <4 x half> addrspace(1)* %out

Modified: llvm/trunk/test/Analysis/CostModel/AMDGPU/fsub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AMDGPU/fsub.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/fsub.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/fsub.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 
 ; ALL: 'fsub_f32'
 ; ALL: estimated cost of 1 for {{.*}} fsub float
-define void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fsub float %vec, %b
   store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @fsub_f32(float addrspace(1)
 
 ; ALL: 'fsub_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fsub <2 x float>
-define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fsub <2 x float> %vec, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @fsub_v2f32(<2 x float> addr
 
 ; ALL: 'fsub_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
-define void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fsub <3 x float> %vec, %b
   store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@ define void @fsub_v3f32(<3 x float> addr
 ; ALL: 'fsub_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fsub double
 ; SLOWF64: estimated cost of 3 for {{.*}} fsub double
-define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fsub double %vec, %b
   store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@ define void @fsub_f64(double addrspace(1
 ; ALL: 'fsub_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fsub <2 x double>
-define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fsub <2 x double> %vec, %b
   store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @fsub_v2f64(<2 x double> add
 ; ALL: 'fsub_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fsub <3 x double>
-define void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fsub <3 x double> %vec, %b
   store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@ define void @fsub_v3f64(<3 x double> add
 
 ; ALL: 'fsub_f16'
 ; ALL: estimated cost of 1 for {{.*}} fsub half
-define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fsub half %vec, %b
   store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @fsub_f16(half addrspace(1)*
 
 ; ALL: 'fsub_v2f16'
 ; ALL: estimated cost of 2 for {{.*}} fsub <2 x half>
-define void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fsub <2 x half> %vec, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fsub_v2f16(<2 x half> addrs
 
 ; ALL: 'fsub_v4f16'
 ; ALL: estimated cost of 4 for {{.*}} fsub <4 x half>
-define void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fsub <4 x half> %vec, %b
   store <4 x half> %add, <4 x half> addrspace(1)* %out

Modified: llvm/trunk/test/Analysis/CostModel/AMDGPU/insertelement.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AMDGPU/insertelement.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/insertelement.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/insertelement.ll Tue Mar 21 16:39:51 2017
@@ -2,7 +2,7 @@
 
 ; CHECK: 'insertelement_v2i32'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i32>
-define void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %insert = insertelement <2 x i32> %vec, i32 1, i32 123
   store <2 x i32> %insert, <2 x i32> addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @insertelement_v2i32(<2 x i3
 
 ; CHECK: 'insertelement_v2i64'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i64>
-define void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %insert = insertelement <2 x i64> %vec, i64 1, i64 123
   store <2 x i64> %insert, <2 x i64> addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @insertelement_v2i64(<2 x i6
 
 ; CHECK: 'insertelement_v2i16'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i16>
-define void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %insert = insertelement <2 x i16> %vec, i16 1, i16 123
   store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @insertelement_v2i16(<2 x i1
 
 ; CHECK: 'insertelement_v2i8'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i8>
-define void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
   %vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr
   %insert = insertelement <2 x i8> %vec, i8 1, i8 123
   store <2 x i8> %insert, <2 x i8> addrspace(1)* %out

Modified: llvm/trunk/test/Analysis/CostModel/AMDGPU/mul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AMDGPU/mul.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/mul.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/mul.ll Tue Mar 21 16:39:51 2017
@@ -2,7 +2,7 @@
 
 ; CHECK: 'mul_i32'
 ; CHECK: estimated cost of 3 for {{.*}} mul i32
-define void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %mul = mul i32 %vec, %b
   store i32 %mul, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @mul_i32(i32 addrspace(1)* %
 
 ; CHECK: 'mul_v2i32'
 ; CHECK: estimated cost of 6 for {{.*}} mul <2 x i32>
-define void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
+define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %mul = mul <2 x i32> %vec, %b
   store <2 x i32> %mul, <2 x i32> addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @mul_v2i32(<2 x i32> addrspa
 
 ; CHECK: 'mul_v3i32'
 ; CHECK: estimated cost of 9 for {{.*}} mul <3 x i32>
-define void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
+define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %mul = mul <3 x i32> %vec, %b
   store <3 x i32> %mul, <3 x i32> addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @mul_v3i32(<3 x i32> addrspa
 
 ; CHECK: 'mul_v4i32'
 ; CHECK: estimated cost of 12 for {{.*}} mul <4 x i32>
-define void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
+define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %mul = mul <4 x i32> %vec, %b
   store <4 x i32> %mul, <4 x i32> addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @mul_v4i32(<4 x i32> addrspa
 
 ; CHECK: 'mul_i64'
 ; CHECK: estimated cost of 16 for {{.*}} mul i64
-define void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %mul = mul i64 %vec, %b
   store i64 %mul, i64 addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @mul_i64(i64 addrspace(1)* %
 
 ; CHECK: 'mul_v2i64'
 ; CHECK: estimated cost of 32 for {{.*}} mul <2 x i64>
-define void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %mul = mul <2 x i64> %vec, %b
   store <2 x i64> %mul, <2 x i64> addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @mul_v2i64(<2 x i64> addrspa
 
 ; CHECK: 'mul_v3i64'
 ; CHECK: estimated cost of 48 for {{.*}} mul <3 x i64>
-define void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %mul = mul <3 x i64> %vec, %b
   store <3 x i64> %mul, <3 x i64> addrspace(1)* %out
@@ -65,7 +65,7 @@ define void @mul_v3i64(<3 x i64> addrspa
 
 ; CHECK: 'mul_v4i64'
 ; CHECK: estimated cost of 64 for {{.*}} mul <4 x i64>
-define void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %mul = mul <4 x i64> %vec, %b
   store <4 x i64> %mul, <4 x i64> addrspace(1)* %out
@@ -75,7 +75,7 @@ define void @mul_v4i64(<4 x i64> addrspa
 
 ; CHECK: 'mul_v8i64'
 ; CHECK: estimated cost of 128 for {{.*}} mul <8 x i64>
-define void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %mul = mul <8 x i64> %vec, %b
   store <8 x i64> %mul, <8 x i64> addrspace(1)* %out

Modified: llvm/trunk/test/Analysis/CostModel/AMDGPU/shifts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/AMDGPU/shifts.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/AMDGPU/shifts.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/AMDGPU/shifts.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 
 ; ALL: 'shl_i32'
 ; ALL: estimated cost of 1 for {{.*}} shl i32
-define void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = shl i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -13,7 +13,7 @@ define void @shl_i32(i32 addrspace(1)* %
 ; ALL: 'shl_i64'
 ; FAST64: estimated cost of 2 for {{.*}} shl i64
 ; SLOW64: estimated cost of 3 for {{.*}} shl i64
-define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = shl i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -22,7 +22,7 @@ define void @shl_i64(i64 addrspace(1)* %
 
 ; ALL: 'lshr_i32'
 ; ALL: estimated cost of 1 for {{.*}} lshr i32
-define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = lshr i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -32,7 +32,7 @@ define void @lshr_i32(i32 addrspace(1)*
 ; ALL: 'lshr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} lshr i64
 ; SLOW64: estimated cost of 3 for {{.*}} lshr i64
-define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = lshr i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -41,7 +41,7 @@ define void @lshr_i64(i64 addrspace(1)*
 
 ; ALL: 'ashr_i32'
 ; ALL: estimated cost of 1 for {{.*}} ashr i32
-define void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = ashr i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @ashr_i32(i32 addrspace(1)*
 ; ALL: 'ashr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} ashr i64
 ; SLOW64: estimated cost of 3 for {{.*}} ashr i64
-define void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = ashr i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out

Modified: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll (original)
+++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll Tue Mar 21 16:39:51 2017
@@ -1,7 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
 
 ; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
-define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
+define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
   %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
   store i32 %swizzle, i32 addrspace(1)* %out, align 4
   ret void

Modified: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll (original)
+++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; CHECK: DIVERGENT:  %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4
 
 ; The post dominator tree does not have a root node in this case
-define void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
+define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
 bb0:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp2 = sext i32 %tmp to i64

Modified: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll (original)
+++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll Tue Mar 21 16:39:51 2017
@@ -1,7 +1,7 @@
 ; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s
 
 ; CHECK: DIVERGENT:  %tmp = cmpxchg volatile
-define void @unreachable_loop(i32 %tidx) #0 {
+define amdgpu_kernel void @unreachable_loop(i32 %tidx) #0 {
 entry:
   unreachable
 

Modified: llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll (original)
+++ llvm/trunk/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll Tue Mar 21 16:39:51 2017
@@ -7,35 +7,35 @@ declare i32 @llvm.amdgcn.mbcnt.lo(i32, i
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
 
 ; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-define void @workitem_id_x() #1 {
+define amdgpu_kernel void @workitem_id_x() #1 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %id.x, i32 addrspace(1)* undef
   ret void
 }
 
 ; CHECK: DIVERGENT:  %id.y = call i32 @llvm.amdgcn.workitem.id.y()
-define void @workitem_id_y() #1 {
+define amdgpu_kernel void @workitem_id_y() #1 {
   %id.y = call i32 @llvm.amdgcn.workitem.id.y()
   store volatile i32 %id.y, i32 addrspace(1)* undef
   ret void
 }
 
 ; CHECK: DIVERGENT:  %id.z = call i32 @llvm.amdgcn.workitem.id.z()
-define void @workitem_id_z() #1 {
+define amdgpu_kernel void @workitem_id_z() #1 {
   %id.z = call i32 @llvm.amdgcn.workitem.id.z()
   store volatile i32 %id.z, i32 addrspace(1)* undef
   ret void
 }
 
 ; CHECK: DIVERGENT:  %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
-define void @mbcnt_lo() #1 {
+define amdgpu_kernel void @mbcnt_lo() #1 {
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
   store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef
   ret void
 }
 
 ; CHECK: DIVERGENT:  %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
-define void @mbcnt_hi() #1 {
+define amdgpu_kernel void @mbcnt_hi() #1 {
   %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
   store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll Tue Mar 21 16:39:51 2017
@@ -13,7 +13,7 @@
 ; FUNC-LABEL: {{^}}local_address_load:
 ; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
 ; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
-define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = load i32, i32 addrspace(3)* %in
   store i32 %0, i32 addrspace(1)* %out
@@ -24,7 +24,7 @@ entry:
 ; SI: s_add_i32 [[SPTR:s[0-9]]]
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_read_b32 [[VPTR]]
-define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
+define amdgpu_kernel void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
 entry:
   %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset
   %1 = load i32, i32 addrspace(3)* %0
@@ -35,7 +35,7 @@ entry:
 ; FUNC-LABEL: {{^}}local_address_gep_const_offset:
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
-define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1
   %1 = load i32, i32 addrspace(3)* %0
@@ -48,7 +48,7 @@ entry:
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_read_b32 [[VPTR]]
-define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385
   %1 = load i32, i32 addrspace(3)* %0
@@ -60,7 +60,7 @@ entry:
 ; SI: v_cmp_ne_u32
 ; SI-NOT: v_cmp_ne_u32
 ; SI: v_cndmask_b32
-define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
+define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
   %cmp = icmp ne i32 addrspace(3)* %lds, null
   %x = select i1 %cmp, i32 123, i32 456
   store i32 %x, i32 addrspace(1)* %out
@@ -71,7 +71,7 @@ define void @null_32bit_lds_ptr(i32 addr
 ; SI: s_mul_i32
 ; SI-NEXT: s_add_i32
 ; SI: ds_read_b32
-define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
+define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
   %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
   %val = load float, float addrspace(3)* %ptr
   store float %val, float addrspace(1)* %out
@@ -83,7 +83,7 @@ define void @mul_32bit_ptr(float addrspa
 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
-define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
+define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
   %val = load float, float addrspace(3)* @g_lds
   store float %val, float addrspace(1)* %out
   ret void
@@ -95,14 +95,14 @@ define void @infer_ptr_alignment_global_
 
 ; FUNC-LABEL: {{^}}global_ptr:
 ; SI: ds_write_b32
-define void @global_ptr() nounwind {
+define amdgpu_kernel void @global_ptr() nounwind {
   store i32 addrspace(3)* getelementptr ([16383 x i32], [16383 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_address_store:
 ; SI: ds_write_b32
-define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
   store i32 %val, i32 addrspace(3)* %out
   ret void
 }
@@ -111,7 +111,7 @@ define void @local_address_store(i32 add
 ; SI: s_add_i32 [[SADDR:s[0-9]+]],
 ; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
 ; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
-define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
+define amdgpu_kernel void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
@@ -121,7 +121,7 @@ define void @local_address_gep_store(i32
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
 ; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
-define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
@@ -132,7 +132,7 @@ define void @local_address_gep_const_off
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
-define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 # REQUIRES: global-isel
 
 --- |
-  define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+  define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
 ...
 ---
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 # REQUIRES: global-isel
 
 --- |
-  define void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
+  define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
 ...
 ---
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 # REQUIRES: global-isel
 
 --- |
-  define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+  define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
 ...
 ---
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir Tue Mar 21 16:39:51 2017
@@ -3,12 +3,12 @@
 # REQUIRES: global-isel
 
 --- |
-  define void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
-  define void @load_global_uniform(i32 addrspace(1)* %ptr1) {
+  define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
+  define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) {
     %tmp0 = load i32, i32 addrspace(1)* %ptr1
     ret void
   }
-  define void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
+  define amdgpu_kernel void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
     %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
     %tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0
     %tmp2 = load i32, i32 addrspace(1)* %tmp1

Modified: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/smrd.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/smrd.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/smrd.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
   %1 = load i32, i32 addrspace(2)* %0
@@ -21,7 +21,7 @@ entry:
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
   %1 = load i32, i32 addrspace(2)* %0
@@ -36,7 +36,7 @@ entry:
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 ; GCN: s_endpgm
-define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
   %1 = load i32, i32 addrspace(2)* %0
@@ -51,7 +51,7 @@ entry:
 ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
 ; TODO: Add VI checks
 ; XGCN: s_endpgm
-define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
   %1 = load i32, i32 addrspace(2)* %0
@@ -65,7 +65,7 @@ entry:
 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
   %1 = load i32, i32 addrspace(2)* %0
@@ -79,7 +79,7 @@ entry:
 ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
   %1 = load i32, i32 addrspace(2)* %0

Modified: llvm/trunk/test/CodeGen/AMDGPU/add-debug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/add-debug.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/add-debug.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/add-debug.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 ; REQUIRES: asserts
 
 ; Check that SelectionDAGDumper does not crash on int_SI_if.
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else

Modified: llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -23,7 +23,7 @@ define void @v_test_add_i16(i16 addrspac
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -38,7 +38,7 @@ define void @v_test_add_i16_constant(i16
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -53,7 +53,7 @@ define void @v_test_add_i16_neg_constant
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -69,7 +69,7 @@ define void @v_test_add_i16_inline_neg1(
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_dword [[ADD]]
-define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -89,7 +89,7 @@ define void @v_test_add_i16_zext_to_i32(
 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
 ; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -109,7 +109,7 @@ define void @v_test_add_i16_zext_to_i64(
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]],  [[B]], [[A]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
-define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -130,7 +130,7 @@ define void @v_test_add_i16_sext_to_i32(
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid

Modified: llvm/trunk/test/CodeGen/AMDGPU/add.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/add.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/add.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/add.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@
 ;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
 ;SI-NOT: [[REG]]
 ;SI: buffer_store_dword [[REG]],
-define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -24,7 +24,7 @@ define void @test1(i32 addrspace(1)* %ou
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -44,7 +44,7 @@ define void @test2(<2 x i32> addrspace(1
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -71,7 +71,7 @@ define void @test4(<4 x i32> addrspace(1
 ; SI: s_add_i32
 ; SI: s_add_i32
 ; SI: s_add_i32
-define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
+define amdgpu_kernel void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
 entry:
   %0 = add <8 x i32> %a, %b
   store <8 x i32> %0, <8 x i32> addrspace(1)* %out
@@ -112,7 +112,7 @@ entry:
 ; SI: s_add_i32
 ; SI: s_add_i32
 ; SI: s_add_i32
-define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
+define amdgpu_kernel void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
 entry:
   %0 = add <16 x i32> %a, %b
   store <16 x i32> %0, <16 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@ entry:
 ; EG-DAG: ADD_INT
 ; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
-define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = add i64 %a, %b
   store i64 %0, i64 addrspace(1)* %out
@@ -150,7 +150,7 @@ entry:
 ; EG-DAG: ADD_INT
 ; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
-define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64, i64 addrspace(1)* %in
   %1 = add i64 %a, %0
@@ -169,7 +169,7 @@ entry:
 ; EG-DAG: ADD_INT
 ; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else

Modified: llvm/trunk/test/CodeGen/AMDGPU/add.v2i16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/add.v2i16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/add.v2i16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/add.v2i16.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 
 ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -27,7 +27,7 @@ define void @v_test_add_v2i16(<2 x i16>
 
 ; VI: s_add_i32
 ; VI: s_add_i32
-define void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
+define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
   %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
   %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
   %add = add <2 x i16> %a, %b
@@ -41,7 +41,7 @@ define void @s_test_add_v2i16(<2 x i16>
 
 ; VI: s_add_i32
 ; VI: s_add_i32
-define void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
+define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
   %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
   %add = add <2 x i16> %a, %a
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@@ -54,7 +54,7 @@ define void @s_test_add_self_v2i16(<2 x
 
 ; VI: v_add_i32
 ; VI: v_add_i32
-define void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
   %add = add <2 x i16> %a, %b
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
   ret void
@@ -66,7 +66,7 @@ define void @s_test_add_v2i16_kernarg(<2
 
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}}
-define void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -83,7 +83,7 @@ define void @v_test_add_v2i16_constant(<
 
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}}
-define void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -102,7 +102,7 @@ define void @v_test_add_v2i16_neg_consta
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
 ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
-define void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -121,7 +121,7 @@ define void @v_test_add_v2i16_inline_neg
 ; VI-NOT: v_add_u16
 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
-define void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -141,7 +141,7 @@ define void @v_test_add_v2i16_inline_lo_
 ; VI-NOT: v_add_u16
 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
-define void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -173,7 +173,7 @@ define void @v_test_add_v2i16_inline_fp_
 ; VI-NOT: and
 ; VI-NOT: shl
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
-define void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -208,7 +208,7 @@ define void @v_test_add_v2i16_zext_to_v2
 ; VI: v_add_u16_e32
 
 ; VI: buffer_store_dwordx4
-define void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -236,7 +236,7 @@ define void @v_test_add_v2i16_zext_to_v2
 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; VI: buffer_store_dwordx2
-define void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -264,7 +264,7 @@ define void @v_test_add_v2i16_sext_to_v2
 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-define void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid

Modified: llvm/trunk/test/CodeGen/AMDGPU/add_i128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/add_i128.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/add_i128.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/add_i128.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 ; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 ; GCN-NEXT: v_addc_u32_e32 v[[HI:[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]],
-define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr i128, i128 addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr i128, i128 addrspace(1)* %inB, i32 %tid
@@ -23,7 +23,7 @@ define void @test_i128_vreg(i128 addrspa
 ; GCN: v_addc_u32
 ; GCN: v_addc_u32
 ; GCN: v_addc_u32
-define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
+define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
   %foo = load i128, i128 addrspace(1)* %in, align 8
   %result = add i128 %foo, %a
   store i128 %result, i128 addrspace(1)* %out
@@ -35,7 +35,7 @@ define void @sgpr_operand(i128 addrspace
 ; GCN: v_addc_u32
 ; GCN: v_addc_u32
 ; GCN: v_addc_u32
-define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
+define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
   %foo = load i128, i128 addrspace(1)* %in, align 8
   %result = add i128 %a, %foo
   store i128 %result, i128 addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @sgpr_operand_reversed(i128
 ; GCN: s_addc_u32
 ; GCN: s_addc_u32
 ; GCN: s_addc_u32
-define void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
+define amdgpu_kernel void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
   %result = add i128 %a, %b
   store i128 %result, i128 addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/add_i64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/add_i64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/add_i64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/add_i64.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; SI-LABEL: {{^}}test_i64_vreg:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
@@ -21,7 +21,7 @@ define void @test_i64_vreg(i64 addrspace
 ; SI-LABEL: {{^}}sgpr_operand:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
+define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
   %foo = load i64, i64 addrspace(1)* %in, align 8
   %result = add i64 %foo, %a
   store i64 %result, i64 addrspace(1)* %out
@@ -34,7 +34,7 @@ define void @sgpr_operand(i64 addrspace(
 ; SI-LABEL: {{^}}sgpr_operand_reversed:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
+define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
   %foo = load i64, i64 addrspace(1)* %in, align 8
   %result = add i64 %a, %foo
   store i64 %result, i64 addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @sgpr_operand_reversed(i64 a
 ; SI: s_addc_u32
 ; SI: s_add_u32
 ; SI: s_addc_u32
-define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
+define amdgpu_kernel void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
   %result = add <2 x i64> %a, %b
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
@@ -58,7 +58,7 @@ define void @test_v2i64_sreg(<2 x i64> a
 ; SI: v_addc_u32
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
@@ -76,7 +76,7 @@ define void @test_v2i64_vreg(<2 x i64> a
 ; SI-NOT: addc
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
-define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   %add = add i64 %b, %a
   %trunc = trunc i64 %add to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8

Modified: llvm/trunk/test/CodeGen/AMDGPU/addrspacecast-captured.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/addrspacecast-captured.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/addrspacecast-captured.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/addrspacecast-captured.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@ declare void @consume_ptr2int(i32) #0
 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
 ; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
 ; CHECK: store i32 %ptr2int, i32 addrspace(1)* %out
-define void @addrspacecast_captured(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @addrspacecast_captured(i32 addrspace(1)* %out) #0 {
 entry:
   %data = alloca i32, align 4
   %cast = addrspacecast i32* %data to i32 addrspace(4)*
@@ -22,7 +22,7 @@ entry:
 ; CHECK: %data = alloca i32, align 4
 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
 ; CHECK: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out
-define void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 {
+define amdgpu_kernel void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 {
 entry:
   %data = alloca i32, align 4
   %cast = addrspacecast i32* %data to i32 addrspace(4)*
@@ -35,7 +35,7 @@ entry:
 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
 ; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
 ; CHECK: call void @consume_ptr2int(i32 %ptr2int)
-define void @addrspacecast_captured_call() #0 {
+define amdgpu_kernel void @addrspacecast_captured_call() #0 {
 entry:
   %data = alloca i32, align 4
   %cast = addrspacecast i32* %data to i32 addrspace(4)*

Modified: llvm/trunk/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll Tue Mar 21 16:39:51 2017
@@ -9,57 +9,57 @@ declare void @llvm.memcpy.p1i32.p4i32.i3
 @global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4
 
 ; HSA: @store_cast_0_flat_to_group_addrspacecast() #1
-define void @store_cast_0_flat_to_group_addrspacecast() #1 {
+define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 {
   store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
   ret void
 }
 
 ; HSA: @store_cast_0_group_to_flat_addrspacecast() #2
-define void @store_cast_0_group_to_flat_addrspacecast() #1 {
+define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 {
   store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*)
   ret void
 }
 
-; HSA: define void @store_constant_cast_group_gv_to_flat() #2
-define void @store_constant_cast_group_gv_to_flat() #1 {
+; HSA: define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #2
+define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 {
   store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*)
   ret void
 }
 
 ; HSA: @store_constant_cast_group_gv_gep_to_flat() #2
-define void @store_constant_cast_group_gv_gep_to_flat() #1 {
+define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 {
   store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
   ret void
 }
 
 ; HSA: @store_constant_cast_global_gv_to_flat() #1
-define void @store_constant_cast_global_gv_to_flat() #1 {
+define amdgpu_kernel void @store_constant_cast_global_gv_to_flat() #1 {
   store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global.i32 to i32 addrspace(4)*)
   ret void
 }
 
 ; HSA: @store_constant_cast_global_gv_gep_to_flat() #1
-define void @store_constant_cast_global_gv_gep_to_flat() #1 {
+define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 {
   store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(1)* @global.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
   ret void
 }
 
 ; HSA: @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
   %val = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
   store i32 %val, i32 addrspace(1)* %out
   ret void
 }
 
 ; HSA: @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
   %val = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
 }
 
 ; HSA: @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
   %val = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst
   %val0 = extractvalue { i32, i1 } %val, 0
   store i32 %val0, i32 addrspace(1)* %out
@@ -67,28 +67,28 @@ define void @cmpxchg_constant_cast_group
 }
 
 ; HSA: @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
   call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* %out, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i32 4, i1 false)
   ret void
 }
 
 ; Can't just search the pointer value
 ; HSA: @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #2
-define void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
+define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
   store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* %out
   ret void
 }
 
 ; Can't just search pointer types
 ; HSA: @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #2
-define void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
   store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* %out
   ret void
 }
 
 ; Cast group to flat, do GEP, cast back to group
 ; HSA: @store_constant_cast_group_gv_gep_to_flat_to_group() #2
-define void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
+define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
   store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*)
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll Tue Mar 21 16:39:51 2017
@@ -28,7 +28,7 @@
 
 ; CI: NumSgprs: {{[0-9][0-9]+}}
 ; GFX9: NumSgprs: {{[0-9]+}}
-define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %stof
   ret void
@@ -58,7 +58,7 @@ define void @use_group_to_flat_addrspace
 
 ; CI: NumSgprs: {{[0-9][0-9]+}}
 ; GFX9: NumSgprs: {{[0-9]+}}
-define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
+define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
   %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %stof
   ret void
@@ -73,7 +73,7 @@ define void @use_private_to_flat_addrspa
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
-define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %stof
   ret void
@@ -85,7 +85,7 @@ define void @use_global_to_flat_addrspac
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
-define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
   %ld = load volatile i32, i32 addrspace(4)* %stof
   ret void
@@ -102,7 +102,7 @@ define void @use_constant_to_flat_addrsp
 ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
-define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
   store volatile i32 0, i32 addrspace(3)* %ftos
   ret void
@@ -119,7 +119,7 @@ define void @use_flat_to_group_addrspace
 ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
   store volatile i32 0, i32* %ftos
   ret void
@@ -133,7 +133,7 @@ define void @use_flat_to_private_addrspa
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
-define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
   store volatile i32 0, i32 addrspace(1)* %ftos
   ret void
@@ -144,7 +144,7 @@ define void @use_flat_to_global_addrspac
 
 ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
-define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
   load volatile i32, i32 addrspace(2)* %ftos
   ret void
@@ -158,7 +158,7 @@ define void @use_flat_to_constant_addrsp
 ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
-define void @cast_0_group_to_flat_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %cast
   ret void
@@ -168,7 +168,7 @@ define void @cast_0_group_to_flat_addrsp
 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
 ; HSA: ds_write_b32 [[PTR]], [[K]]
-define void @cast_0_flat_to_group_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
   store volatile i32 7, i32 addrspace(3)* %cast
   ret void
@@ -179,7 +179,7 @@ define void @cast_0_flat_to_group_addrsp
 ; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
-define void @cast_neg1_group_to_flat_addrspacecast() #0 {
+define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %cast
   ret void
@@ -189,7 +189,7 @@ define void @cast_neg1_group_to_flat_add
 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
 ; HSA: ds_write_b32 [[PTR]], [[K]]
-define void @cast_neg1_flat_to_group_addrspacecast() #0 {
+define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
   store volatile i32 7, i32 addrspace(3)* %cast
   ret void
@@ -204,7 +204,7 @@ define void @cast_neg1_flat_to_group_add
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
-define void @cast_0_private_to_flat_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
   %cast = addrspacecast i32* null to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %cast
   ret void
@@ -214,7 +214,7 @@ define void @cast_0_private_to_flat_addr
 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
 ; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-define void @cast_0_flat_to_private_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
   store volatile i32 7, i32* %cast
   ret void
@@ -226,7 +226,7 @@ define void @cast_0_flat_to_private_addr
 ; HSA-LABEL: {{^}}branch_use_flat_i32:
 ; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
 ; HSA: s_endpgm
-define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
+define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
 entry:
   %cmp = icmp ne i32 %c, 0
   br i1 %cmp, label %local, label %global
@@ -259,7 +259,7 @@ end:
 ; HSA: flat_store_dword
 ; HSA: s_barrier
 ; HSA: flat_load_dword
-define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
+define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
   %alloca = alloca i32, i32 9, align 4
   %x = call i32 @llvm.amdgcn.workitem.id.x() #2
   %pptr = getelementptr i32, i32* %alloca, i32 %x

Modified: llvm/trunk/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/amdgcn.bitcast.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/amdgcn.bitcast.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/amdgcn.bitcast.ll Tue Mar 21 16:39:51 2017
@@ -16,7 +16,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}i8ptr_v16i8ptr:
 ; SI: s_endpgm
-define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
   %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0
@@ -24,7 +24,7 @@ entry:
   ret void
 }
 
-define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %load = load float, float addrspace(1)* %in, align 4
   %fadd32 = fadd float %load, 1.0
   %bc = bitcast float %fadd32 to <2 x i16>
@@ -33,7 +33,7 @@ define void @f32_to_v2i16(<2 x i16> addr
   ret void
 }
 
-define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
   %add.v2i16 = add <2 x i16> %load, <i16 2, i16 2>
   %bc = bitcast <2 x i16> %add.v2i16 to float
@@ -42,7 +42,7 @@ define void @v2i16_to_f32(float addrspac
   ret void
 }
 
-define void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %load = load float, float addrspace(1)* %in, align 4
   %fadd32 = fadd float %load, 1.0
   %bc = bitcast float %fadd32 to <2 x half>
@@ -51,7 +51,7 @@ define void @f32_to_v2f16(<2 x half> add
   ret void
 }
 
-define void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
   %load = load <2 x half>, <2 x half> addrspace(1)* %in, align 4
   %add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0>
   %bc = bitcast <2 x half> %add.v2f16 to float
@@ -60,14 +60,14 @@ define void @v2f16_to_f32(float addrspac
   ret void
 }
 
-define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %bc = bitcast <4 x i8> %load to i32
   store i32 %bc, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %bc = bitcast i32 %load to <4 x i8>
   store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
@@ -76,7 +76,7 @@ define void @i32_to_v4i8(<4 x i8> addrsp
 
 ; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64:
 ; SI: s_endpgm
-define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %add = add <2 x i32> %val, <i32 4, i32 9>
   %bc = bitcast <2 x i32> %add to double
@@ -87,7 +87,7 @@ define void @bitcast_v2i32_to_f64(double
 
 ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
 ; SI: s_endpgm
-define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
   %val = load double, double addrspace(1)* %in, align 8
   %add = fadd double %val, 4.0
   %bc = bitcast double %add to <2 x i32>
@@ -96,7 +96,7 @@ define void @bitcast_f64_to_v2i32(<2 x i
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
-define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
+define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
@@ -112,7 +112,7 @@ end:
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
-define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
+define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end

Modified: llvm/trunk/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/amdgcn.private-memory.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/amdgcn.private-memory.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/amdgcn.private-memory.ll Tue Mar 21 16:39:51 2017
@@ -15,7 +15,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; GCN-NOT: v0
 ; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]]
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = alloca [2 x i32]
   %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0

Modified: llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 
 ; NOOP-LABEL: @noop_fdiv_fpmath(
 ; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
-define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
+define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
   %md.25ulp = fdiv float %a, %b, !fpmath !0
   store volatile float %md.25ulp, float addrspace(1)* %out
   ret void
@@ -18,7 +18,7 @@ define void @noop_fdiv_fpmath(float addr
 ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
 ; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
 ; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
+define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
   %no.md = fdiv float %a, %b
   store volatile float %no.md, float addrspace(1)* %out
 
@@ -51,7 +51,7 @@ define void @fdiv_fpmath(float addrspace
 ; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
 ; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
 ; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
-define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
   %no.md = fdiv float 1.0, %x
   store volatile float %no.md, float addrspace(1)* %out
 
@@ -89,7 +89,7 @@ define void @rcp_fdiv_fpmath(float addrs
 ; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
 ; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
 ; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
-define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
+define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
   %no.md = fdiv <2 x float> %a, %b
   store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
 
@@ -120,7 +120,7 @@ define void @fdiv_fpmath_vector(<2 x flo
 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
-define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
   %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
   store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
 
@@ -158,7 +158,7 @@ define void @rcp_fdiv_fpmath_vector(<2 x
 ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
 ; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp
-define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
   %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
   store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
 
@@ -186,7 +186,7 @@ define void @rcp_fdiv_fpmath_vector_nons
 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp
-define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
   %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
 
   %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
@@ -206,7 +206,7 @@ define void @rcp_fdiv_fpmath_vector_part
 ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
 ; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
   %no.md = fdiv float %a, %b
   store volatile float %no.md, float addrspace(1)* %out
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll Tue Mar 21 16:39:51 2017
@@ -80,7 +80,7 @@
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -102,7 +102,7 @@ entry:
 
 ; OPT-LABEL: @high_alignment(
 ; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}}
-define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [8 x i32], align 16
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -127,7 +127,7 @@ entry:
 ; OPT: alloca [5 x i32]
 
 ; SI-NOT: ds_write
-define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -162,7 +162,7 @@ entry:
 ; SI-NOT: v_movrel
 %struct.point = type { i32, i32 }
 
-define void @multiple_structs(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
 entry:
   %a = alloca %struct.point
   %b = alloca %struct.point
@@ -191,7 +191,7 @@ entry:
 ; R600-NOT: MOVA_INT
 ; SI-NOT: v_movrel
 
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 entry:
   %prv_array_const = alloca [2 x i32]
   %prv_array = alloca [2 x i32]
@@ -235,7 +235,7 @@ for.end:
 ; SI-PROMOTE: s_load_dword [[IDX:s[0-9]+]]
 ; SI-PROMOTE: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16
 ; SI-PROMOTE: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[SCALED_IDX]], 16
-define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i16]
   %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
@@ -258,7 +258,7 @@ entry:
 
 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
-define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i8]
   %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
@@ -281,7 +281,7 @@ entry:
 ;
 ; A total of 5 bytes should be allocated and used.
 ; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ;
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1
   %1 = alloca [2 x i8], align 1
@@ -305,7 +305,7 @@ entry:
   ret void
 }
 
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i8]]
   %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
@@ -319,7 +319,7 @@ entry:
   ret void
 }
 
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -332,7 +332,7 @@ entry:
   ret void
 }
 
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i64]]
   %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
@@ -347,7 +347,7 @@ entry:
 
 %struct.pair32 = type { i32, i32 }
 
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]]
   %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
@@ -360,7 +360,7 @@ entry:
   ret void
 }
 
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x %struct.pair32]
   %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
@@ -373,7 +373,7 @@ entry:
   ret void
 }
 
-define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
   %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
@@ -394,7 +394,7 @@ entry:
 ; SI-NOT: ds_write
 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32]
   %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   store i32 5, i32* %tmp0
@@ -410,7 +410,7 @@ define void @ptrtoint(i32 addrspace(1)*
 ; OPT-LABEL: @pointer_typed_alloca(
 ; OPT:  getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}}
 ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
-define void @pointer_typed_alloca(i32 addrspace(1)* %A) {
+define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4
   store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
@@ -462,7 +462,7 @@ entry:
 ; SI: buffer_load_dword
 ; SI: buffer_load_dword
 
-define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
   %alloca = alloca [2 x <16 x i32>]
   %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a
   %tmp5 = load <16 x i32>, <16 x i32>* %tmp0
@@ -506,7 +506,7 @@ define void @v16i32_stack(<16 x i32> add
 ; SI: buffer_load_dword
 ; SI: buffer_load_dword
 
-define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
   %alloca = alloca [2 x <16 x float>]
   %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a
   %tmp5 = load <16 x float>, <16 x float>* %tmp0
@@ -522,7 +522,7 @@ define void @v16float_stack(<16 x float>
 ; SI: buffer_load_dword
 ; SI: buffer_load_dword
 
-define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
   %alloca = alloca [16 x <2 x float>]
   %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a
   %tmp5 = load <2 x float>, <2 x float>* %tmp0
@@ -533,7 +533,7 @@ define void @v2float_stack(<2 x float> a
 ; OPT-LABEL: @direct_alloca_read_0xi32(
 ; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)*
 ; OPT: load [0 x i32], [0 x i32] addrspace(3)*
-define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [0 x i32]
   store [0 x i32] [], [0 x i32]* %tmp
@@ -545,7 +545,7 @@ entry:
 ; OPT-LABEL: @direct_alloca_read_1xi32(
 ; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)*
 ; OPT: load [1 x i32], [1 x i32] addrspace(3)*
-define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [1 x i32]
   store [1 x i32] [i32 0], [1 x i32]* %tmp

Modified: llvm/trunk/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll Tue Mar 21 16:39:51 2017
@@ -12,7 +12,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
-define void @ngroups_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -27,7 +27,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
-define void @ngroups_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -42,7 +42,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
-define void @ngroups_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -57,7 +57,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
-define void @global_size_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -72,7 +72,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
-define void @global_size_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -87,7 +87,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
-define void @global_size_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -102,7 +102,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Z
-define void @local_size_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -117,7 +117,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].W
-define void @local_size_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -132,7 +132,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[2].X
-define void @local_size_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -153,7 +153,7 @@ entry:
 ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_x_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_x_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -165,7 +165,7 @@ entry:
 ; GCN-NOHSA: buffer_store_dword [[VVAL]]
 
 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
-define void @tgid_y_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_y_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -181,7 +181,7 @@ entry:
 ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_z_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_z_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -194,7 +194,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}tidig_x_legacy:
 ; GCN-NOHSA: buffer_store_dword v0
-define void @tidig_x_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_x_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -208,7 +208,7 @@ entry:
 ; FUNC-LABEL: {{^}}tidig_y_legacy:
 
 ; GCN-NOHSA: buffer_store_dword v1
-define void @tidig_y_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_y_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -221,7 +221,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}tidig_z_legacy:
 ; GCN-NOHSA: buffer_store_dword v2
-define void @tidig_z_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_z_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.z() #0
   store i32 %0, i32 addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/and-gcn.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/and-gcn.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/and-gcn.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/and-gcn.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 ; FUNC-LABEL: {{^}}v_and_i64_br:
 ; SI: v_and_b32
 ; SI: v_and_b32
-define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
 entry:
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0

Modified: llvm/trunk/test/CodeGen/AMDGPU/and.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/and.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/and.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/and.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -31,7 +31,7 @@ define void @test2(<2 x i32> addrspace(1
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -42,7 +42,7 @@ define void @test4(<4 x i32> addrspace(1
 
 ; FUNC-LABEL: {{^}}s_and_i32:
 ; SI: s_and_b32
-define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %and = and i32 %a, %b
   store i32 %and, i32 addrspace(1)* %out, align 4
   ret void
@@ -50,7 +50,7 @@ define void @s_and_i32(i32 addrspace(1)*
 
 ; FUNC-LABEL: {{^}}s_and_constant_i32:
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
-define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
   %and = and i32 %a, 1234567
   store i32 %and, i32 addrspace(1)* %out, align 4
   ret void
@@ -66,7 +66,7 @@ define void @s_and_constant_i32(i32 addr
 ; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
 ; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
 ; SI: buffer_store_dword [[VK]]
-define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %and = and i32 %a, 1234567
 
   ; Just to stop future replacement of copy to vgpr + store with VALU op.
@@ -83,7 +83,7 @@ define void @s_and_multi_use_constant_i3
 ; SI: s_add_i32
 ; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]]
 ; SI: buffer_store_dword [[VK]]
-define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %and = and i32 %a, 1234567
   %foo = add i32 %and, 1234567
   %bar = add i32 %foo, %b
@@ -93,7 +93,7 @@ define void @s_and_multi_use_constant_i3
 
 ; FUNC-LABEL: {{^}}v_and_i32_vgpr_vgpr:
 ; SI: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -109,7 +109,7 @@ define void @v_and_i32_vgpr_vgpr(i32 add
 ; SI-DAG: s_load_dword [[SA:s[0-9]+]]
 ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
-define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -123,7 +123,7 @@ define void @v_and_i32_sgpr_vgpr(i32 add
 ; SI-DAG: s_load_dword [[SA:s[0-9]+]]
 ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
-define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
+define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -135,7 +135,7 @@ define void @v_and_i32_vgpr_sgpr(i32 add
 
 ; FUNC-LABEL: {{^}}v_and_constant_i32
 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
-define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, 1234567
   store i32 %and, i32 addrspace(1)* %out, align 4
@@ -144,7 +144,7 @@ define void @v_and_constant_i32(i32 addr
 
 ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
 ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
-define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, 64
   store i32 %and, i32 addrspace(1)* %out, align 4
@@ -153,7 +153,7 @@ define void @v_and_inline_imm_64_i32(i32
 
 ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
 ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
-define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, -16
   store i32 %and, i32 addrspace(1)* %out, align 4
@@ -162,7 +162,7 @@ define void @v_and_inline_imm_neg_16_i32
 
 ; FUNC-LABEL: {{^}}s_and_i64
 ; SI: s_and_b64
-define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %and = and i64 %a, %b
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -171,7 +171,7 @@ define void @s_and_i64(i64 addrspace(1)*
 ; FIXME: Should use SGPRs
 ; FUNC-LABEL: {{^}}s_and_i1:
 ; SI: v_and_b32
-define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
+define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
   %and = and i1 %a, %b
   store i1 %and, i1 addrspace(1)* %out
   ret void
@@ -181,7 +181,7 @@ define void @s_and_i1(i1 addrspace(1)* %
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000{{$}}
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80{{$}}
 ; SI: buffer_store_dwordx2
-define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
   %and = and i64 %a, 549756338176
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -191,7 +191,7 @@ define void @s_and_constant_i64(i64 addr
 ; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}}
 ; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}}
 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}
-define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %and0 = and i64 %a, 549756338176
   %and1 = and i64 %b, 549756338176
   store volatile i64 %and0, i64 addrspace(1)* %out
@@ -205,7 +205,7 @@ define void @s_and_multi_use_constant_i6
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -223,7 +223,7 @@ define void @s_and_32_bit_constant_i64(i
 ; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
   %shl.a = shl i64 %a, 1
   %shl.b = shl i64 %b, 1
   %and0 = and i64 %shl.a, 62
@@ -238,7 +238,7 @@ define void @s_and_multi_use_inline_imm_
 ; FUNC-LABEL: {{^}}v_and_i64:
 ; SI: v_and_b32
 ; SI: v_and_b32
-define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
   %and = and i64 %a, %b
@@ -250,7 +250,7 @@ define void @v_and_i64(i64 addrspace(1)*
 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}}
 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
 ; SI: buffer_store_dwordx2
-define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 1231231234567
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -268,7 +268,7 @@ define void @v_and_constant_i64(i64 addr
 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]]
 ; SI: buffer_store_dwordx2
 ; SI: buffer_store_dwordx2
-define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load volatile i64, i64 addrspace(1)* %aptr
   %b = load volatile i64, i64 addrspace(1)* %aptr
   %and0 = and i64 %a, 1231231234567
@@ -288,7 +288,7 @@ define void @v_and_multi_use_constant_i6
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
-define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load volatile i64, i64 addrspace(1)* %aptr
   %b = load volatile i64, i64 addrspace(1)* %aptr
   %and0 = and i64 %a, 63
@@ -304,7 +304,7 @@ define void @v_and_multi_use_inline_imm_
 ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -317,7 +317,7 @@ define void @v_and_i64_32_bit_constant(i
 ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 64
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -331,7 +331,7 @@ define void @v_and_inline_imm_i64(i64 ad
 ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
-define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, -8
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -344,7 +344,7 @@ define void @v_and_inline_neg_imm_i64(i6
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 64
 ; SI-NOT: and
 ; SI: buffer_store_dword
-define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 64
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -358,7 +358,7 @@ define void @s_and_inline_imm_64_i64(i64
 ; SI-NOT: and
 ; SI: s_add_u32
 ; SI-NEXT: s_addc_u32
-define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
   %shl = shl i64 %a, 1
   %and = and i64 %shl, 64
   %add = add i64 %and, %b
@@ -372,7 +372,7 @@ define void @s_and_inline_imm_64_i64_nos
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 1
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -387,7 +387,7 @@ define void @s_and_inline_imm_1_i64(i64
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4607182418800017408
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -402,7 +402,7 @@ define void @s_and_inline_imm_1.0_i64(i6
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13830554455654793216
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -417,7 +417,7 @@ define void @s_and_inline_imm_neg_1.0_i6
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4602678819172646912
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -432,7 +432,7 @@ define void @s_and_inline_imm_0.5_i64(i6
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13826050856027422720
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -445,7 +445,7 @@ define void @s_and_inline_imm_neg_0.5_i6
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4611686018427387904
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -458,7 +458,7 @@ define void @s_and_inline_imm_2.0_i64(i6
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13835058055282163712
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -473,7 +473,7 @@ define void @s_and_inline_imm_neg_2.0_i6
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4616189618054758400
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -488,7 +488,7 @@ define void @s_and_inline_imm_4.0_i64(i6
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13839561654909534208
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -505,7 +505,7 @@ define void @s_and_inline_imm_neg_4.0_i6
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 1082130432
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -518,7 +518,7 @@ define void @s_and_inline_imm_f32_4.0_i6
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, -1065353216
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -531,7 +531,7 @@ define void @s_and_inline_imm_f32_neg_4.
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4647714815446351872
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -544,7 +544,7 @@ define void @s_and_inline_high_imm_f32_4
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13871086852301127680
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll Tue Mar 21 16:39:51 2017
@@ -11,22 +11,22 @@ declare i32 @llvm.amdgcn.workitem.id.z()
 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
 
-; HSA: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workgroup.id.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workgroup.id.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
   %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -34,8 +34,8 @@ define void @multi_use_tgid_y(i32 addrsp
   ret void
 }
 
-; HSA: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -43,15 +43,15 @@ define void @use_tgid_x_y(i32 addrspace(
   ret void
 }
 
-; HSA: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workgroup.id.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -59,8 +59,8 @@ define void @use_tgid_x_z(i32 addrspace(
   ret void
 }
 
-; HSA: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -68,8 +68,8 @@ define void @use_tgid_y_z(i32 addrspace(
   ret void
 }
 
-; HSA: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
   %val2 = call i32 @llvm.amdgcn.workgroup.id.z()
@@ -79,29 +79,29 @@ define void @use_tgid_x_y_z(i32 addrspac
   ret void
 }
 
-; HSA: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
-define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
+define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workitem.id.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
-define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
+define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workitem.id.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workitem.id.x()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.x()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -109,8 +109,8 @@ define void @use_tidig_x_tgid_x(i32 addr
   ret void
 }
 
-; HSA: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
-define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
+define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workitem.id.y()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -118,8 +118,8 @@ define void @use_tidig_y_tgid_y(i32 addr
   ret void
 }
 
-; HSA: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
-define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
+define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workitem.id.x()
   %val1 = call i32 @llvm.amdgcn.workitem.id.y()
   %val2 = call i32 @llvm.amdgcn.workitem.id.z()
@@ -129,8 +129,8 @@ define void @use_tidig_x_y_z(i32 addrspa
   ret void
 }
 
-; HSA: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
-define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
+define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workitem.id.x()
   %val1 = call i32 @llvm.amdgcn.workitem.id.y()
   %val2 = call i32 @llvm.amdgcn.workitem.id.z()
@@ -146,8 +146,8 @@ define void @use_all_workitems(i32 addrs
   ret void
 }
 
-; HSA: define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
-define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
+define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
   %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
   %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
   %val = load i32, i32 addrspace(2)* %bc
@@ -155,8 +155,8 @@ define void @use_dispatch_ptr(i32 addrsp
   ret void
 }
 
-; HSA: define void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
-define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
+define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
   %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
   %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
   %val = load i32, i32 addrspace(2)* %bc
@@ -164,58 +164,58 @@ define void @use_queue_ptr(i32 addrspace
   ret void
 }
 
-; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
-define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
+define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
   store volatile i32 0, i32 addrspace(4)* %stof
   ret void
 }
 
-; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
-define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
+define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
   %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
   store volatile i32 0, i32 addrspace(4)* %stof
   ret void
 }
 
-; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
   store volatile i32 0, i32 addrspace(3)* %ftos
   ret void
 }
 
-; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
   store volatile i32 0, i32* %ftos
   ret void
 }
 
 ; No-op addrspacecast should not use queue ptr
-; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
-define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
   %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
   store volatile i32 0, i32 addrspace(4)* %stof
   ret void
 }
 
-; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
-define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
   %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
   %ld = load volatile i32, i32 addrspace(4)* %stof
   ret void
 }
 
-; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
   store volatile i32 0, i32 addrspace(1)* %ftos
   ret void
 }
 
-; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
   %ld = load volatile i32, i32 addrspace(2)* %ftos
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features.ll Tue Mar 21 16:39:51 2017
@@ -12,22 +12,22 @@ declare i32 @llvm.r600.read.local.size.x
 declare i32 @llvm.r600.read.local.size.y() #0
 declare i32 @llvm.r600.read.local.size.z() #0
 
-; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tgid.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tgid.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
   %val1 = call i32 @llvm.r600.read.tgid.y()
@@ -35,8 +35,8 @@ define void @multi_use_tgid_y(i32 addrsp
   ret void
 }
 
-; ALL: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.x()
   %val1 = call i32 @llvm.r600.read.tgid.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -44,15 +44,15 @@ define void @use_tgid_x_y(i32 addrspace(
   ret void
 }
 
-; ALL: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tgid.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.x()
   %val1 = call i32 @llvm.r600.read.tgid.z()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -60,8 +60,8 @@ define void @use_tgid_x_z(i32 addrspace(
   ret void
 }
 
-; ALL: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.y()
   %val1 = call i32 @llvm.r600.read.tgid.z()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -69,8 +69,8 @@ define void @use_tgid_y_z(i32 addrspace(
   ret void
 }
 
-; ALL: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.x()
   %val1 = call i32 @llvm.r600.read.tgid.y()
   %val2 = call i32 @llvm.r600.read.tgid.z()
@@ -80,29 +80,29 @@ define void @use_tgid_x_y_z(i32 addrspac
   ret void
 }
 
-; ALL: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tidig.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
-define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
+define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tidig.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
-define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
+define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tidig.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tidig.x()
   %val1 = call i32 @llvm.r600.read.tgid.x()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -110,8 +110,8 @@ define void @use_tidig_x_tgid_x(i32 addr
   ret void
 }
 
-; ALL: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
-define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
+define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tidig.y()
   %val1 = call i32 @llvm.r600.read.tgid.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -119,8 +119,8 @@ define void @use_tidig_y_tgid_y(i32 addr
   ret void
 }
 
-; ALL: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
-define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
+define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tidig.x()
   %val1 = call i32 @llvm.r600.read.tidig.y()
   %val2 = call i32 @llvm.r600.read.tidig.z()
@@ -130,8 +130,8 @@ define void @use_tidig_x_y_z(i32 addrspa
   ret void
 }
 
-; ALL: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
-define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
+define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tidig.x()
   %val1 = call i32 @llvm.r600.read.tidig.y()
   %val2 = call i32 @llvm.r600.read.tidig.z()
@@ -147,25 +147,25 @@ define void @use_all_workitems(i32 addrs
   ret void
 }
 
-; HSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
-; NOHSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.local.size.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
-; NOHSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
-define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.local.size.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
-; NOHSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
-define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.local.size.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/anonymous-gv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/anonymous-gv.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/anonymous-gv.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/anonymous-gv.ll Tue Mar 21 16:39:51 2017
@@ -6,13 +6,13 @@
 ; CHECK-LABEL: {{^}}test:
 ; CHECK: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, __unnamed_1
 ; CHECK: s_endpgm
-define void @test() {
+define amdgpu_kernel void @test() {
   store i32 1, i32 addrspace(1)* @0
   ret void
 }
 
 ; CHECK-LABEL: {{^}}__unnamed_2:
 ; CHECK: s_endpgm
-define void @1() {
+define amdgpu_kernel void @1() {
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/anyext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/anyext.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/anyext.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/anyext.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.y()
 
 ; GCN-LABEL: {{^}}anyext_i1_i32:
 ; GCN: v_cndmask_b32_e64
-define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %tmp = icmp eq i32 %cond, 0
   %tmp1 = zext i1 %tmp to i8
@@ -22,7 +22,7 @@ entry:
 ; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]]
 ; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]]
 ; VI: buffer_store_dword [[AND]]
-define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
+define amdgpu_kernel void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
 entry:
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()

Modified: llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll Tue Mar 21 16:39:51 2017
@@ -24,7 +24,7 @@ declare void @llvm.amdgcn.s.barrier() #2
 
 ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
 ; SI-PROMOTE: ds_write_b32 [[PTRREG]]
-define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
+define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
   %alloca = alloca [16 x i32], align 16
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)

Modified: llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i
 ; SI-DAG: v_mul_lo_i32
 ; SI-DAG: v_mul_hi_i32
 ; SI: s_endpgm
-define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
   %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0

Modified: llvm/trunk/test/CodeGen/AMDGPU/ashr.v2i16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ashr.v2i16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ashr.v2i16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ashr.v2i16.ll Tue Mar 21 16:39:51 2017
@@ -13,7 +13,7 @@
 ; CIVI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CIVI: v_or_b32_e32
-define void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
+define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
   %result = ashr <2 x i16> %lhs, %rhs
   store <2 x i16> %result, <2 x i16> addrspace(1)* %out
   ret void
@@ -40,7 +40,7 @@ define void @s_ashr_v2i16(<2 x i16> addr
 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
 ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -57,7 +57,7 @@ define void @v_ashr_v2i16(<2 x i16> addr
 ; GFX9: s_load_dword [[RHS:s[0-9]+]]
 ; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
-define void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+define amdgpu_kernel void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -72,7 +72,7 @@ define void @ashr_v_s_v2i16(<2 x i16> ad
 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
 ; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
-define void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+define amdgpu_kernel void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -86,7 +86,7 @@ define void @ashr_s_v_v2i16(<2 x i16> ad
 ; GCN-LABEL: {{^}}ashr_imm_v_v2i16:
 ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], -4
-define void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -100,7 +100,7 @@ define void @ashr_imm_v_v2i16(<2 x i16>
 ; GCN-LABEL: {{^}}ashr_v_imm_v2i16:
 ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], 8, [[LHS]]
-define void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -117,7 +117,7 @@ define void @ashr_v_imm_v2i16(<2 x i16>
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: {{buffer|flat}}_store_dwordx2
-define void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -135,7 +135,7 @@ define void @v_ashr_v4i16(<4 x i16> addr
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
 ; GCN: {{buffer|flat}}_store_dwordx2
-define void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext

Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll Tue Mar 21 16:39:51 2017
@@ -12,7 +12,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
 ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
@@ -33,7 +33,7 @@ define void @lds_atomic_cmpxchg_ret_i32_
 ; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
 ; GCN: buffer_store_dwordx2 [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
   %result = extractvalue { i64, i1 } %pair, 0
@@ -45,7 +45,7 @@ define void @lds_atomic_cmpxchg_ret_i64_
 ; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -65,7 +65,7 @@ define void @lds_atomic_cmpxchg_ret_i32_
 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
 ; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
@@ -84,7 +84,7 @@ define void @lds_atomic_cmpxchg_noret_i3
 ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
 ; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
   %result = extractvalue { i64, i1 } %pair, 0

Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_load_add.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_load_add.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_load_add.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_load_add.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_local:
 ; R600: LDS_ADD *
 ; SI: ds_add_u32
-define void @atomic_add_local(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) {
    %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
    ret void
 }
@@ -13,7 +13,7 @@ define void @atomic_add_local(i32 addrsp
 ; FUNC-LABEL: {{^}}atomic_add_local_const_offset:
 ; R600: LDS_ADD *
 ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
   %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
   %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
   ret void
@@ -22,7 +22,7 @@ define void @atomic_add_local_const_offs
 ; FUNC-LABEL: {{^}}atomic_add_ret_local:
 ; R600: LDS_ADD_RET *
 ; SI: ds_add_rtn_u32
-define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
@@ -31,7 +31,7 @@ define void @atomic_add_ret_local(i32 ad
 ; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset:
 ; R600: LDS_ADD_RET *
 ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
-define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
   %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_load_sub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_load_sub.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_load_sub.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_load_sub.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_local:
 ; R600: LDS_SUB *
 ; SI: ds_sub_u32
-define void @atomic_sub_local(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) {
    %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
    ret void
 }
@@ -13,7 +13,7 @@ define void @atomic_sub_local(i32 addrsp
 ; FUNC-LABEL: {{^}}atomic_sub_local_const_offset:
 ; R600: LDS_SUB *
 ; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
   %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
   %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
   ret void
@@ -22,7 +22,7 @@ define void @atomic_sub_local_const_offs
 ; FUNC-LABEL: {{^}}atomic_sub_ret_local:
 ; R600: LDS_SUB_RET *
 ; SI: ds_sub_rtn_u32
-define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
@@ -31,7 +31,7 @@ define void @atomic_sub_ret_local(i32 ad
 ; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset:
 ; R600: LDS_SUB_RET *
 ; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
-define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
   %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @min_64_max_64() #0 {
+define amdgpu_kernel void @min_64_max_64() #0 {
 entry:
   ret void
 }
@@ -16,7 +16,7 @@ attributes #0 = {"amdgpu-flat-work-group
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @min_64_max_128() #1 {
+define amdgpu_kernel void @min_64_max_128() #1 {
 entry:
   ret void
 }
@@ -27,7 +27,7 @@ attributes #1 = {"amdgpu-flat-work-group
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @min_128_max_128() #2 {
+define amdgpu_kernel void @min_128_max_128() #2 {
 entry:
   ret void
 }
@@ -39,7 +39,7 @@ attributes #2 = {"amdgpu-flat-work-group
 ; CHECK: NumSGPRsForWavesPerEU: 13
 ; CHECK: NumVGPRsForWavesPerEU: 32
 @var = addrspace(1) global float 0.0
-define void @min_1024_max_2048() #3 {
+define amdgpu_kernel void @min_1024_max_2048() #3 {
   %val0 = load volatile float, float addrspace(1)* @var
   %val1 = load volatile float, float addrspace(1)* @var
   %val2 = load volatile float, float addrspace(1)* @var

Modified: llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@
 
 ; ALL: SGPRBlocks: 1
 ; ALL: NumSGPRsForWavesPerEU: 9
-define void @max_9_sgprs(i32 addrspace(1)* %out1,
+define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1,
 
                           i32 addrspace(1)* %out2,
                           i32 addrspace(1)* %out3,
@@ -49,7 +49,7 @@ define void @max_9_sgprs(i32 addrspace(1
 
 ; TOSMEM: SGPRBlocks: 1
 ; TOSMEM: NumSGPRsForWavesPerEU: 16
-define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
+define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
                                         i32 addrspace(1)* %out2,
                                         i32 addrspace(1)* %out3,
                                         i32 addrspace(1)* %out4,
@@ -90,7 +90,7 @@ stores:
 
 ; XALL: SGPRBlocks: 2
 ; XALL: NumSGPRsForWavesPerEU: 18
-;define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
+;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
 ;                                        i32 addrspace(1)* %out2,
 ;                                        i32 addrspace(1)* %out3,
 ;                                        i32 addrspace(1)* %out4,

Modified: llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: {{^}}max_20_vgprs:
 ; CHECK: VGPRBlocks: 4
 ; CHECK: NumVGPRsForWavesPerEU: 20
-define void @max_20_vgprs() #1 {
+define amdgpu_kernel void @max_20_vgprs() #1 {
   %val0 = load volatile float, float addrspace(1)* @var
   %val1 = load volatile float, float addrspace(1)* @var
   %val2 = load volatile float, float addrspace(1)* @var

Modified: llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 ; CHECK: VGPRBlocks: 32
 ; CHECK: NumSGPRsForWavesPerEU: 102
 ; CHECK: NumVGPRsForWavesPerEU: 129
-define void @empty_exactly_1() #0 {
+define amdgpu_kernel void @empty_exactly_1() #0 {
 entry:
   ret void
 }
@@ -18,7 +18,7 @@ attributes #0 = {"amdgpu-waves-per-eu"="
 ; CHECK: VGPRBlocks: 10
 ; CHECK: NumSGPRsForWavesPerEU: 102
 ; CHECK: NumVGPRsForWavesPerEU: 41
-define void @empty_exactly_5() #1 {
+define amdgpu_kernel void @empty_exactly_5() #1 {
 entry:
   ret void
 }
@@ -30,7 +30,7 @@ attributes #1 = {"amdgpu-waves-per-eu"="
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_exactly_10() #2 {
+define amdgpu_kernel void @empty_exactly_10() #2 {
 entry:
   ret void
 }
@@ -42,7 +42,7 @@ attributes #2 = {"amdgpu-waves-per-eu"="
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_least_1() #3 {
+define amdgpu_kernel void @empty_at_least_1() #3 {
 entry:
   ret void
 }
@@ -54,7 +54,7 @@ attributes #3 = {"amdgpu-waves-per-eu"="
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_least_5() #4 {
+define amdgpu_kernel void @empty_at_least_5() #4 {
 entry:
   ret void
 }
@@ -66,7 +66,7 @@ attributes #4 = {"amdgpu-waves-per-eu"="
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_least_10() #5 {
+define amdgpu_kernel void @empty_at_least_10() #5 {
 entry:
   ret void
 }
@@ -80,7 +80,7 @@ attributes #5 = {"amdgpu-waves-per-eu"="
 ; CHECK: VGPRBlocks: 10
 ; CHECK: NumSGPRsForWavesPerEU: 102
 ; CHECK: NumVGPRsForWavesPerEU: 41
-define void @empty_at_most_5() #6 {
+define amdgpu_kernel void @empty_at_most_5() #6 {
 entry:
   ret void
 }
@@ -92,7 +92,7 @@ attributes #6 = {"amdgpu-waves-per-eu"="
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_most_10() #7 {
+define amdgpu_kernel void @empty_at_most_10() #7 {
 entry:
   ret void
 }
@@ -106,7 +106,7 @@ attributes #7 = {"amdgpu-waves-per-eu"="
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_between_5_and_10() #8 {
+define amdgpu_kernel void @empty_between_5_and_10() #8 {
 entry:
   ret void
 }
@@ -120,7 +120,7 @@ attributes #8 = {"amdgpu-waves-per-eu"="
 ; CHECK: VGPRBlocks: 5
 ; CHECK: NumSGPRsForWavesPerEU: 13
 ; CHECK: NumVGPRsForWavesPerEU: 24
-define void @exactly_10() #9 {
+define amdgpu_kernel void @exactly_10() #9 {
   %val0 = load volatile float, float addrspace(1)* @var
   %val1 = load volatile float, float addrspace(1)* @var
   %val2 = load volatile float, float addrspace(1)* @var

Modified: llvm/trunk/test/CodeGen/AMDGPU/attr-unparseable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/attr-unparseable.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/attr-unparseable.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/attr-unparseable.ll Tue Mar 21 16:39:51 2017
@@ -1,56 +1,56 @@
 ; RUN: not llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s 2>&1 | FileCheck %s
 
 ; CHECK: can't parse integer attribute amdgpu-num-sgpr
-define void @unparseable_single_0() #0 {
+define amdgpu_kernel void @unparseable_single_0() #0 {
 entry:
   ret void
 }
 attributes #0 = {"amdgpu-num-sgpr"}
 
 ; CHECK: can't parse integer attribute amdgpu-num-sgpr
-define void @unparseable_single_1() #1 {
+define amdgpu_kernel void @unparseable_single_1() #1 {
 entry:
   ret void
 }
 attributes #1 = {"amdgpu-num-sgpr"="k"}
 
 ; CHECK: can't parse integer attribute amdgpu-num-sgpr
-define void @unparseable_single_2() #2 {
+define amdgpu_kernel void @unparseable_single_2() #2 {
 entry:
   ret void
 }
 attributes #2 = {"amdgpu-num-sgpr"="1,2"}
 
 ; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_0() #3 {
+define amdgpu_kernel void @unparseable_pair_0() #3 {
 entry:
   ret void
 }
 attributes #3 = {"amdgpu-flat-work-group-size"}
 
 ; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_1() #4 {
+define amdgpu_kernel void @unparseable_pair_1() #4 {
 entry:
   ret void
 }
 attributes #4 = {"amdgpu-flat-work-group-size"="k"}
 
 ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_2() #5 {
+define amdgpu_kernel void @unparseable_pair_2() #5 {
 entry:
   ret void
 }
 attributes #5 = {"amdgpu-flat-work-group-size"="1"}
 
 ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_3() #6 {
+define amdgpu_kernel void @unparseable_pair_3() #6 {
 entry:
   ret void
 }
 attributes #6 = {"amdgpu-flat-work-group-size"="1,k"}
 
 ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_4() #7 {
+define amdgpu_kernel void @unparseable_pair_4() #7 {
 entry:
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll Tue Mar 21 16:39:51 2017
@@ -15,7 +15,7 @@
 
 ; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
-define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
+define amdgpu_kernel void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
   %cmp = icmp ne i32 %val, 0
   br i1 %cmp, label %store, label %end
 
@@ -39,7 +39,7 @@ end:
 
 ; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
-define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
+define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
   %cmp0 = icmp ne i1 %val, 0
   br i1 %cmp0, label %store, label %end
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/basic-loop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/basic-loop.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/basic-loop.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/basic-loop.ll Tue Mar 21 16:39:51 2017
@@ -2,7 +2,7 @@
 ; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}test_loop:
-define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
+define amdgpu_kernel void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
 entry:
   br label %loop.body
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/bfe-patterns.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/bfe-patterns.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bfe-patterns.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/bfe-patterns.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
 ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]]
-define void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
   %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
@@ -32,7 +32,7 @@ define void @v_ubfe_sub_i32(i32 addrspac
 
 ; GCN: [[BFE]]
 ; GCN: [[SHL]]
-define void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
   %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
@@ -52,7 +52,7 @@ define void @v_ubfe_sub_multi_use_shl_i3
 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
 ; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
 ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
-define void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
+define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
   %sub = sub i32 32, %width
@@ -68,7 +68,7 @@ define void @s_ubfe_sub_i32(i32 addrspac
 ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
 ; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
 ; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
-define void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
+define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
   %sub = sub i32 32, %width
@@ -83,7 +83,7 @@ define void @s_ubfe_sub_multi_use_shl_i3
 ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
 ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]]
-define void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
   %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
@@ -110,7 +110,7 @@ define void @v_sbfe_sub_i32(i32 addrspac
 
 ; GCN: [[BFE]]
 ; GCN: [[SHL]]
-define void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
   %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
@@ -130,7 +130,7 @@ define void @v_sbfe_sub_multi_use_shl_i3
 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
 ; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
 ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
-define void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
+define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
   %sub = sub i32 32, %width
@@ -146,7 +146,7 @@ define void @s_sbfe_sub_i32(i32 addrspac
 ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
 ; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
 ; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
-define void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
+define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
   %sub = sub i32 32, %width

Modified: llvm/trunk/test/CodeGen/AMDGPU/bfe_uint.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/bfe_uint.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bfe_uint.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/bfe_uint.ll Tue Mar 21 16:39:51 2017
@@ -2,7 +2,7 @@
 
 ; CHECK: {{^}}bfe_def:
 ; CHECK: BFE_UINT
-define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
 entry:
   %0 = lshr i32 %x, 5
   %1 = and i32 %0, 15 ; 0xf
@@ -17,7 +17,7 @@ entry:
 
 ; CHECK: {{^}}bfe_shift:
 ; CHECK-NOT: BFE_UINT
-define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
 entry:
   %0 = lshr i32 %x, 16
   %1 = and i32 %0, 65535 ; 0xffff

Modified: llvm/trunk/test/CodeGen/AMDGPU/bfi_int.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/bfi_int.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bfi_int.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/bfi_int.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; R600: BFI_INT
 ; SI:   @bfi_def
 ; SI:   v_bfi_b32
-define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %x, -1
   %1 = and i32 %z, %0
@@ -25,7 +25,7 @@ entry:
 ; R600: BFI_INT
 ; SI:   @bfi_sha256_ch
 ; SI:   v_bfi_b32
-define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %y, %z
   %1 = and i32 %x, %0
@@ -42,7 +42,7 @@ entry:
 ; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
 ; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
 
-define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = and i32 %x, %z
   %1 = or i32 %x, %z

Modified: llvm/trunk/test/CodeGen/AMDGPU/bfm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/bfm.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bfm.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/bfm.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: {{^}}bfm_pattern:
 ; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
   %a = shl i32 1, %x
   %b = sub i32 %a, 1
   %c = shl i32 %b, %y
@@ -14,7 +14,7 @@ define void @bfm_pattern(i32 addrspace(1
 
 ; FUNC-LABEL: {{^}}bfm_pattern_simple:
 ; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
-define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
   %a = shl i32 1, %x
   %b = sub i32 %a, 1
   store i32 %b, i32 addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/bitcast-vector-extract.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bitcast-vector-extract.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/bitcast-vector-extract.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
   %vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
   store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
 
@@ -27,7 +27,7 @@ define void @store_bitcast_constant_v8i3
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
   %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
   store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
 
@@ -43,7 +43,7 @@ define void @store_bitcast_constant_v4i6
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
   %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
   store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out
 
@@ -59,7 +59,7 @@ define void @store_bitcast_constant_v4i6
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
   %vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
   store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
 
@@ -70,7 +70,7 @@ define void @store_bitcast_constant_v8i3
 
 ; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source:
 ; GCN-NOT: store_dword
-define void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
+define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
   %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1
   %bc = bitcast i64 %undef to <2 x i32>
   store volatile <2 x i32> %bc, <2 x i32> addrspace(1)* %out
@@ -79,7 +79,7 @@ define void @store_value_lowered_to_unde
 
 ; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt:
 ; GCN-NOT: store_dword
-define void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
+define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
   %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1
   %bc = bitcast i64 %undef to <2 x i32>
   %elt1 = extractelement <2 x i32> %bc, i32 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 ; GCN-LABEL: {{^}}materialize_0_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_0_i32(i32 addrspace(1)* %out) {
   store i32 0, i32 addrspace(1)* %out
   ret void
 }
@@ -16,7 +16,7 @@ define void @materialize_0_i32(i32 addrs
 ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_0_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_0_i64(i64 addrspace(1)* %out) {
   store i64 0, i64 addrspace(1)* %out
   ret void
 }
@@ -24,7 +24,7 @@ define void @materialize_0_i64(i64 addrs
 ; GCN-LABEL: {{^}}materialize_neg1_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -1{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_neg1_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_neg1_i32(i32 addrspace(1)* %out) {
   store i32 -1, i32 addrspace(1)* %out
   ret void
 }
@@ -33,7 +33,7 @@ define void @materialize_neg1_i32(i32 ad
 ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
 ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_neg1_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_neg1_i64(i64 addrspace(1)* %out) {
   store i64 -1, i64 addrspace(1)* %out
   ret void
 }
@@ -41,7 +41,7 @@ define void @materialize_neg1_i64(i64 ad
 ; GCN-LABEL: {{^}}materialize_signbit_i32:
 ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_signbit_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_signbit_i32(i32 addrspace(1)* %out) {
   store i32 -2147483648, i32 addrspace(1)* %out
   ret void
 }
@@ -50,7 +50,7 @@ define void @materialize_signbit_i32(i32
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_signbit_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_signbit_i64(i64 addrspace(1)* %out) {
   store i64  -9223372036854775808, i64 addrspace(1)* %out
   ret void
 }
@@ -58,7 +58,7 @@ define void @materialize_signbit_i64(i64
 ; GCN-LABEL: {{^}}materialize_rev_neg16_i32:
 ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], -16{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
   store i32 268435455, i32 addrspace(1)* %out
   ret void
 }
@@ -67,7 +67,7 @@ define void @materialize_rev_neg16_i32(i
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], -16{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
   store i64  1152921504606846975, i64 addrspace(1)* %out
   ret void
 }
@@ -75,7 +75,7 @@ define void @materialize_rev_neg16_i64(i
 ; GCN-LABEL: {{^}}materialize_rev_neg17_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xf7ffffff{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
   store i32 -134217729, i32 addrspace(1)* %out
   ret void
 }
@@ -84,7 +84,7 @@ define void @materialize_rev_neg17_i32(i
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0xf7ffffff{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
   store i64 -576460752303423489, i64 addrspace(1)* %out
   ret void
 }
@@ -92,7 +92,7 @@ define void @materialize_rev_neg17_i64(i
 ; GCN-LABEL: {{^}}materialize_rev_64_i32:
 ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 64{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
   store i32 33554432, i32 addrspace(1)* %out
   ret void
 }
@@ -101,7 +101,7 @@ define void @materialize_rev_64_i32(i32
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 64{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
   store i64 144115188075855872, i64 addrspace(1)* %out
   ret void
 }
@@ -109,7 +109,7 @@ define void @materialize_rev_64_i64(i64
 ; GCN-LABEL: {{^}}materialize_rev_65_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x82000000{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
   store i32 -2113929216, i32 addrspace(1)* %out
   ret void
 }
@@ -118,7 +118,7 @@ define void @materialize_rev_65_i32(i32
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0x82000000{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
   store i64 -9079256848778919936, i64 addrspace(1)* %out
   ret void
 }
@@ -126,7 +126,7 @@ define void @materialize_rev_65_i64(i64
 ; GCN-LABEL: {{^}}materialize_rev_3_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -2.0{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
   store i32 -1073741824, i32 addrspace(1)* %out
   ret void
 }
@@ -135,7 +135,7 @@ define void @materialize_rev_3_i32(i32 a
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], -2.0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
   store i64 -4611686018427387904, i64 addrspace(1)* %out
   ret void
 }
@@ -143,7 +143,7 @@ define void @materialize_rev_3_i64(i64 a
 ; GCN-LABEL: {{^}}materialize_rev_1.0_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1fc{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
   store i32 508, i32 addrspace(1)* %out
   ret void
 }
@@ -152,70 +152,70 @@ define void @materialize_rev_1.0_i32(i32
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0x1fc{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
   store i64 508, i64 addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_0_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 0{{$}}
-define void @s_materialize_0_i32() {
+define amdgpu_kernel void @s_materialize_0_i32() {
   call void asm sideeffect "; use $0", "s"(i32 0)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_1_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 1{{$}}
-define void @s_materialize_1_i32() {
+define amdgpu_kernel void @s_materialize_1_i32() {
   call void asm sideeffect "; use $0", "s"(i32 1)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_neg1_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, -1{{$}}
-define void @s_materialize_neg1_i32() {
+define amdgpu_kernel void @s_materialize_neg1_i32() {
   call void asm sideeffect "; use $0", "s"(i32 -1)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_signbit_i32:
 ; GCN: s_brev_b32 s{{[0-9]+}}, 1{{$}}
-define void @s_materialize_signbit_i32() {
+define amdgpu_kernel void @s_materialize_signbit_i32() {
   call void asm sideeffect "; use $0", "s"(i32 -2147483648)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_64_i32:
 ; GCN: s_brev_b32 s{{[0-9]+}}, 64{{$}}
-define void @s_materialize_rev_64_i32() {
+define amdgpu_kernel void @s_materialize_rev_64_i32() {
   call void asm sideeffect "; use $0", "s"(i32 33554432)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_65_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 0x82000000{{$}}
-define void @s_materialize_rev_65_i32() {
+define amdgpu_kernel void @s_materialize_rev_65_i32() {
   call void asm sideeffect "; use $0", "s"(i32 -2113929216)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_neg16_i32:
 ; GCN: s_brev_b32 s{{[0-9]+}}, -16{{$}}
-define void @s_materialize_rev_neg16_i32() {
+define amdgpu_kernel void @s_materialize_rev_neg16_i32() {
   call void asm sideeffect "; use $0", "s"(i32 268435455)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_neg17_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 0xf7ffffff{{$}}
-define void @s_materialize_rev_neg17_i32() {
+define amdgpu_kernel void @s_materialize_rev_neg17_i32() {
   call void asm sideeffect "; use $0", "s"(i32 -134217729)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_1.0_i32:
 ; GCN: s_movk_i32 s{{[0-9]+}}, 0x1fc{{$}}
-define void @s_materialize_rev_1.0_i32() {
+define amdgpu_kernel void @s_materialize_rev_1.0_i32() {
   call void asm sideeffect "; use $0", "s"(i32 508)
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/bitreverse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/bitreverse.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bitreverse.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/bitreverse.ll Tue Mar 21 16:39:51 2017
@@ -14,7 +14,7 @@ declare <4 x i64> @llvm.bitreverse.v4i64
 
 ; FUNC-LABEL: {{^}}s_brev_i16:
 ; SI: s_brev_b32 
-define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
+define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
   %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
   store i16 %brev, i16 addrspace(1)* %out
   ret void
@@ -22,7 +22,7 @@ define void @s_brev_i16(i16 addrspace(1)
 
 ; FUNC-LABEL: {{^}}v_brev_i16:
 ; SI: v_bfrev_b32_e32
-define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
   %val = load i16, i16 addrspace(1)* %valptr
   %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
   store i16 %brev, i16 addrspace(1)* %out
@@ -35,7 +35,7 @@ define void @v_brev_i16(i16 addrspace(1)
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
 ; SI: s_endpgm
-define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
+define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
   %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
   store i32 %brev, i32 addrspace(1)* %out
   ret void
@@ -46,7 +46,7 @@ define void @s_brev_i32(i32 addrspace(1)
 ; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
   %val = load i32, i32 addrspace(1)* %valptr
   %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
   store i32 %brev, i32 addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @v_brev_i32(i32 addrspace(1)
 ; FUNC-LABEL: {{^}}s_brev_v2i32:
 ; SI: s_brev_b32
 ; SI: s_brev_b32
-define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
+define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
   %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
   store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
   ret void
@@ -65,7 +65,7 @@ define void @s_brev_v2i32(<2 x i32> addr
 ; FUNC-LABEL: {{^}}v_brev_v2i32:
 ; SI: v_bfrev_b32_e32
 ; SI: v_bfrev_b32_e32
-define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr
   %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
   store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
@@ -73,7 +73,7 @@ define void @v_brev_v2i32(<2 x i32> addr
 }
 
 ; FUNC-LABEL: {{^}}s_brev_i64:
-define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
+define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
   %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
   store i64 %brev, i64 addrspace(1)* %out
   ret void
@@ -81,7 +81,7 @@ define void @s_brev_i64(i64 addrspace(1)
 
 ; FUNC-LABEL: {{^}}v_brev_i64:
 ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
-define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
   %val = load i64, i64 addrspace(1)* %valptr
   %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
   store i64 %brev, i64 addrspace(1)* %out
@@ -89,14 +89,14 @@ define void @v_brev_i64(i64 addrspace(1)
 }
 
 ; FUNC-LABEL: {{^}}s_brev_v2i64:
-define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
+define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
   %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
   store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_brev_v2i64:
-define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr
   %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
   store <2 x i64> %brev, <2 x i64> addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/br_cc.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/br_cc.f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/br_cc.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/br_cc.f16.ll Tue Mar 21 16:39:51 2017
@@ -20,7 +20,7 @@
 ; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[B_F16]]
 ; GCN: s_endpgm
-define void @br_cc_f16(
+define amdgpu_kernel void @br_cc_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -59,7 +59,7 @@ two:
 ; GCN: two{{$}}
 ; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
 
-define void @br_cc_f16_imm_a(
+define amdgpu_kernel void @br_cc_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -92,7 +92,7 @@ two:
 ; VI:  v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
 ; GCN: buffer_store_short v[[B_F16]]
 ; GCN: s_endpgm
-define void @br_cc_f16_imm_b(
+define amdgpu_kernel void @br_cc_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:

Modified: llvm/trunk/test/CodeGen/AMDGPU/branch-relax-spill.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/branch-relax-spill.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/branch-relax-spill.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/branch-relax-spill.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 
 ; FAIL: LLVM ERROR: Error while trying to spill VCC from class SReg_64: Cannot scavenge register without an emergency spill slot!
 
-define void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
 entry:
   %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={SGPR0}"() #0
   %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={SGPR1}"() #0

Modified: llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll Tue Mar 21 16:39:51 2017
@@ -26,7 +26,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
-define void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
 bb:
   %cmp = icmp eq i32 %cnd, 0
   br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
@@ -68,7 +68,7 @@ bb3:
 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
-define void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
 bb0:
   %cmp = icmp eq i32 %cnd, 0
   br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
@@ -108,7 +108,7 @@ bb3:
 ; GCN: [[ENDBB]]:
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
-define void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
 bb0:
   %cmp = fcmp oeq float %cnd, 0.0
   br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
@@ -141,7 +141,7 @@ bb3:
 ; GCN: s_or_b64 exec, exec, [[SAVE]]
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = zext i32 %tid to i64
@@ -188,7 +188,7 @@ bb3:
 
 ; GCN-NEXT: [[ENDBB]]:
 ; GCN-NEXT: s_endpgm
-define void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
 bb:
   br label %bb2
 
@@ -243,7 +243,7 @@ bb3:
 ; GCN: buffer_store_dword [[BB4_K]]
 ; GCN-NEXT: s_endpgm
 ; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
-define void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
 bb0:
   %tmp = icmp ne i32 %arg1, 0
   br i1 %tmp, label %bb2, label %bb3
@@ -285,7 +285,7 @@ bb4:
 ; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}}
 ; GCN-NEXT: s_setpc_b64 vcc
 ; GCN-NEXT .Lfunc_end{{[0-9]+}}:
-define void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
 entry:
   br label %loop
 
@@ -342,7 +342,7 @@ loop:
 ; GCN-NEXT: v_nop_e64
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: s_endpgm
-define void @expand_requires_expand(i32 %cond0) #0 {
+define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
 bb0:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %cmp0 = icmp slt i32 %cond0, 0
@@ -399,7 +399,7 @@ bb3:
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 ; GCN-NEXT: s_sleep 5
 ; GCN-NEXT: s_endpgm
-define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
+define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %d_cmp = icmp ult i32 %tid, 16
@@ -462,7 +462,7 @@ endif:
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 ; GCN: buffer_store_dword
 ; GCN-NEXT: s_endpgm
-define void @analyze_mask_branch() #0 {
+define amdgpu_kernel void @analyze_mask_branch() #0 {
 entry:
   %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"()
   %cmp0 = fcmp ogt float %reg, 0.000000e+00

Modified: llvm/trunk/test/CodeGen/AMDGPU/bswap.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/bswap.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bswap.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/bswap.ll Tue Mar 21 16:39:51 2017
@@ -17,7 +17,7 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x
 ; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
   store i32 %bswap, i32 addrspace(1)* %out, align 4
@@ -32,7 +32,7 @@ define void @test_bswap_i32(i32 addrspac
 ; SI-DAG: v_alignbit_b32
 ; SI-DAG: v_bfi_b32
 ; SI: s_endpgm
-define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
   store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8
@@ -53,7 +53,7 @@ define void @test_bswap_v2i32(<2 x i32>
 ; SI-DAG: v_alignbit_b32
 ; SI-DAG: v_bfi_b32
 ; SI: s_endpgm
-define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
   store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16
@@ -86,7 +86,7 @@ define void @test_bswap_v4i32(<4 x i32>
 ; SI-DAG: v_alignbit_b32
 ; SI-DAG: v_bfi_b32
 ; SI: s_endpgm
-define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
   %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
   %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
   store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32
@@ -95,21 +95,21 @@ define void @test_bswap_v8i32(<8 x i32>
 
 ; FUNC-LABEL: {{^}}test_bswap_i64:
 ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
-define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %val = load i64, i64 addrspace(1)* %in, align 8
   %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
   store i64 %bswap, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
   %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
   store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16
   ret void
 }
 
-define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
   %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
   %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
   store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32

Modified: llvm/trunk/test/CodeGen/AMDGPU/build_vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/build_vector.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/build_vector.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/build_vector.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
 ; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
 ; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}}
-define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @build_vector2 (<2 x i32> addrspace(1)* %out) {
 entry:
   store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
   ret void
@@ -28,7 +28,7 @@ entry:
 ; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
 ; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
 ; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}}
-define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @build_vector4 (<4 x i32> addrspace(1)* %out) {
 entry:
   store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/call.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/call.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/call.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@
 
 declare i32 @external_function(i32) nounwind
 
-define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -25,7 +25,7 @@ define i32 @defined_function(i32 %x) nou
   ret i32 %y
 }
 
-define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -35,7 +35,7 @@ define void @test_call(i32 addrspace(1)*
   ret void
 }
 
-define void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr

Modified: llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 ; GCN-LABEL: {{^}}store_fi_lifetime:
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
-define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %b = alloca i8
   call void @llvm.lifetime.start(i64 1, i8* %b)
@@ -18,7 +18,7 @@ entry:
 ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}}
 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32  [[VLDSPTR]], [[ZERO0]]
-define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
   %tmp = alloca float
   store float 4.0, float *%tmp
   store float* %tmp, float* addrspace(3)* %ptr
@@ -38,7 +38,7 @@ define void @stored_fi_to_lds(float* add
 
 ; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
 ; GCN: ds_write_b32  [[VLDSPTR]], [[FI1]]
-define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
   %tmp0 = alloca float
   %tmp1 = alloca float
   store float 4.0, float* %tmp0
@@ -54,7 +54,7 @@ define void @stored_fi_to_lds_2_small_ob
 ; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
-define void @stored_fi_to_self() #0 {
+define amdgpu_kernel void @stored_fi_to_self() #0 {
   %tmp = alloca i32*
 
   ; Avoid optimizing everything out
@@ -73,7 +73,7 @@ define void @stored_fi_to_self() #0 {
 
 ; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}}
 ; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}
-define void @stored_fi_to_self_offset() #0 {
+define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
   %tmp0 = alloca [512 x i32]
   %tmp1 = alloca i32*
 
@@ -98,7 +98,7 @@ define void @stored_fi_to_self_offset()
 
 ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
 ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
-define void @stored_fi_to_fi() #0 {
+define amdgpu_kernel void @stored_fi_to_fi() #0 {
   %tmp0 = alloca i32*
   %tmp1 = alloca i32*
   %tmp2 = alloca i32*
@@ -118,7 +118,7 @@ define void @stored_fi_to_fi() #0 {
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
-define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
   %tmp = alloca float
   store float 0.0, float *%tmp
   store float* %tmp, float* addrspace(1)* %ptr
@@ -136,7 +136,7 @@ define void @stored_fi_to_global(float*
 
 ; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
 ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
   %tmp0 = alloca float
   %tmp1 = alloca float
   %tmp2 = alloca float
@@ -163,7 +163,7 @@ define void @stored_fi_to_global_2_small
 ; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
 ; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
   %tmp0 = alloca [4096 x i32]
   %tmp1 = alloca [4096 x i32]
   %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0
@@ -186,7 +186,7 @@ define void @stored_fi_to_global_huge_fr
 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+4
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
-define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
+define amdgpu_kernel void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
 entry:
   %b = alloca i32, align 4
   %tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 ; GCN: ds_write_b32
 ; GCN: s_branch [[LABEL]]
 ; GCN: s_endpgm
-define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
   %cmp = icmp eq i32 %n, -1
   br i1 %cmp, label %for.exit, label %for.body
@@ -31,7 +31,7 @@ for.body:
 ; GCN: ds_read_b32
 ; GCN: ds_write_b32
 ; GCN: s_branch [[LABEL]]
-define void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
   br label %for.body
 
@@ -52,7 +52,7 @@ for.body:
 ; GCN-LABEL: {{^}}loop_const_false:
 ; GCN-NOT: s_branch
 ; GCN: s_endpgm
-define void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
   br label %for.body
 
@@ -74,7 +74,7 @@ for.body:
 ; GCN-LABEL: {{^}}loop_const_undef:
 ; GCN-NOT: s_branch
 ; GCN: s_endpgm
-define void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
   br label %for.body
 
@@ -104,7 +104,7 @@ for.body:
 ; GCN: s_cbranch_vccnz [[LOOPBB]]
 ; GCN-NEXT: ; BB#2
 ; GCN-NEXT: s_endpgm
-define void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
+define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
 entry:
   br label %for.body
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/cf-stack-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cf-stack-bug.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cf-stack-bug.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cf-stack-bug.ll Tue Mar 21 16:39:51 2017
@@ -35,7 +35,7 @@
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested3:
-define void @nested3(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested3(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -68,7 +68,7 @@ end:
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested4:
-define void @nested4(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested4(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -109,7 +109,7 @@ end:
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested7:
-define void @nested7(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested7(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -174,7 +174,7 @@ end:
 ; BUG32: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested8:
-define void @nested8(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested8(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end

Modified: llvm/trunk/test/CodeGen/AMDGPU/cf_end.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cf_end.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cf_end.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cf_end.ll Tue Mar 21 16:39:51 2017
@@ -4,6 +4,6 @@
 
 ; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80]
 ; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88]
-define void @eop() {
+define amdgpu_kernel void @eop() {
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@
 ; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
 ; GCN: flat_load_dword
 ; GCN: {{^}}BB0_2:
-define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+define amdgpu_kernel void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
@@ -43,7 +43,7 @@ done:
 
 ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32:
 ; CI: buffer_load_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
-define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
@@ -76,7 +76,7 @@ done:
 
 ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32:
 ; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7

Modified: llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cgp-addressing-modes.ll Tue Mar 21 16:39:51 2017
@@ -15,7 +15,7 @@ target datalayout = "e-p:32:32-p1:64:64-
 
 ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
 ; GCN: {{^}}BB0_2:
-define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
@@ -45,7 +45,7 @@ done:
 ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
 ; GCN: {{^}}BB1_2:
 ; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
@@ -72,7 +72,7 @@ done:
 ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
 ; GCN: {{^}}BB2_2:
 ; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
@@ -99,7 +99,7 @@ done:
 ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
 ; GCN: {{^}}BB3_2:
 ; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
@@ -131,7 +131,7 @@ done:
 ; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
 ; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
 ; GCN: {{^}}BB4_2:
-define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4
   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@@ -172,7 +172,7 @@ done:
 ; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
 ; GCN: {{^BB[0-9]+}}_2:
 
-define void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4
   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@@ -209,7 +209,7 @@ done:
 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 ; GCN: {{^BB[0-9]+}}_2:
-define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4
   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@@ -241,7 +241,7 @@ done:
 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GCN: {{^BB[0-9]+}}_2:
-define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
+define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
 entry:
   %offset.ext = zext i32 %offset to i64
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
@@ -271,7 +271,7 @@ done:
 ; GCN: s_and_saveexec_b64
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
@@ -300,7 +300,7 @@ done:
 ; GCN: s_and_saveexec_b64
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
@@ -333,7 +333,7 @@ done:
 
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
@@ -365,7 +365,7 @@ done:
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
@@ -396,7 +396,7 @@ done:
 ; GCN: s_addc_u32
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181
@@ -426,7 +426,7 @@ done:
 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
 
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
@@ -464,7 +464,7 @@ done:
 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
 
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
@@ -494,7 +494,7 @@ done:
 ; GCN: s_load_dword [[SREG1:s[0-9]+]],
 ; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
 ; GCN-DAG: ds_read2_b32 v[{{[0-9+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
-define void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
+define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
 entry:
   %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
   %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
@@ -521,7 +521,7 @@ bb34:
 ; OPT: if:
 ; OPT: %sunkaddr = ptrtoint i8 addrspace(2)* %in to i64
 ; OPT: %sunkaddr1 = add i64 %sunkaddr, 4095
-define void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
   %in.gep = getelementptr i8, i8 addrspace(2)* %in, i64 4095
@@ -548,7 +548,7 @@ done:
 ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28
 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)*
 ; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %sunkaddr2, i32 2 seq_cst
-define void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
@@ -574,7 +574,7 @@ done:
 ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28
 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)*
 ; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %sunkaddr2, i32 undef, i32 2 seq_cst monotonic
-define void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
@@ -600,7 +600,7 @@ done:
 ; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
 ; OPT: br i1
 ; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
-define void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
 entry:
   %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999
   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
@@ -627,7 +627,7 @@ done:
 ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28
 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)*
 ; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %sunkaddr2, i32 2)
-define void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
@@ -653,7 +653,7 @@ done:
 ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28
 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)*
 ; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %sunkaddr2, i32 2)
-define void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
   %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7

Modified: llvm/trunk/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll Tue Mar 21 16:39:51 2017
@@ -36,7 +36,7 @@
 ; GCN: BB0_3:
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
 entry:
   %shr = lshr i32 %arg1, 8
   br i1 undef, label %bb0, label %bb1
@@ -76,7 +76,7 @@ ret:
 ; OPT: ret
 
 ; GCN-LABEL: {{^}}sink_sbfe_i32:
-define void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+define amdgpu_kernel void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
 entry:
   %shr = ashr i32 %arg1, 8
   br i1 undef, label %bb0, label %bb1
@@ -134,7 +134,7 @@ ret:
 ; GCN: BB2_3:
 ; GCN: buffer_store_short
 ; GCN: s_endpgm
-define void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
 entry:
   %shr = lshr i16 %arg1, 4
   br i1 undef, label %bb0, label %bb1
@@ -187,7 +187,7 @@ ret:
 
 ; GCN: BB3_3:
 ; GCN: buffer_store_dwordx2
-define void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
 entry:
   %shr = lshr i64 %arg1, 30
   br i1 undef, label %bb0, label %bb1
@@ -236,7 +236,7 @@ ret:
 
 ; GCN: BB4_3:
 ; GCN: buffer_store_dwordx2
-define void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
 entry:
   %shr = lshr i64 %arg1, 15
   br i1 undef, label %bb0, label %bb1
@@ -283,7 +283,7 @@ ret:
 
 ; GCN: BB5_3:
 ; GCN: buffer_store_dwordx2
-define void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
 entry:
   %shr = lshr i64 %arg1, 35
   br i1 undef, label %bb0, label %bb1

Modified: llvm/trunk/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@ declare i1 @llvm.amdgcn.class.f32(float,
 ; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
-define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
+define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
 bb0:
   %tmp = icmp sgt i32 %arg1, 4
   %c = icmp eq i32 %arg3, 0
@@ -35,7 +35,7 @@ bb2:
 ; GCN-NOT: vcc
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
-define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
+define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
 bb0:
   %tmp = icmp sgt i32 %arg1, 4
   %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)

Modified: llvm/trunk/test/CodeGen/AMDGPU/coalescer_remat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/coalescer_remat.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/coalescer_remat.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/coalescer_remat.ll Tue Mar 21 16:39:51 2017
@@ -13,7 +13,7 @@ declare float @llvm.fma.f32(float, float
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; It's probably OK if this is slightly higher:
 ; CHECK: ; NumVgprs: 8
-define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
+define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
 entry:
   %cmpflag = icmp eq i32 %flag, 1
   br i1 %cmpflag, label %loop, label %exit

Modified: llvm/trunk/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@
 ; SI-LLC-LABEL: {{^}}test:
 ; SI-LLC: s_mul_i32
 ; SI-LLC-NOT: mul
-define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
+define amdgpu_kernel void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
 entry:
   %0 = mul nsw i32 %a, 3
   %1 = sext i32 %0 to i64

Modified: llvm/trunk/test/CodeGen/AMDGPU/combine_vloads.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/combine_vloads.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/combine_vloads.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/combine_vloads.ll Tue Mar 21 16:39:51 2017
@@ -12,7 +12,7 @@
 ; EG-LABEL: {{^}}combine_vloads:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
+define amdgpu_kernel void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
 entry:
   br label %for.body
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/commute-compares.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/commute-compares.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/commute-compares.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/commute-compares.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 
 ; GCN-LABEL: {{^}}commute_eq_64_i32:
 ; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -21,7 +21,7 @@ define void @commute_eq_64_i32(i32 addrs
 
 ; GCN-LABEL: {{^}}commute_ne_64_i32:
 ; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -36,7 +36,7 @@ define void @commute_ne_64_i32(i32 addrs
 ; GCN-LABEL: {{^}}commute_ne_litk_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
 ; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}}
-define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -49,7 +49,7 @@ define void @commute_ne_litk_i32(i32 add
 
 ; GCN-LABEL: {{^}}commute_ugt_64_i32:
 ; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -62,7 +62,7 @@ define void @commute_ugt_64_i32(i32 addr
 
 ; GCN-LABEL: {{^}}commute_uge_64_i32:
 ; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
-define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -75,7 +75,7 @@ define void @commute_uge_64_i32(i32 addr
 
 ; GCN-LABEL: {{^}}commute_ult_64_i32:
 ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -88,7 +88,7 @@ define void @commute_ult_64_i32(i32 addr
 
 ; GCN-LABEL: {{^}}commute_ule_63_i32:
 ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -104,7 +104,7 @@ define void @commute_ule_63_i32(i32 addr
 ; GCN-LABEL: {{^}}commute_ule_64_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
 ; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
-define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -117,7 +117,7 @@ define void @commute_ule_64_i32(i32 addr
 
 ; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
 ; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}}
-define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -130,7 +130,7 @@ define void @commute_sgt_neg1_i32(i32 ad
 
 ; GCN-LABEL: {{^}}commute_sge_neg2_i32:
 ; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
-define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -143,7 +143,7 @@ define void @commute_sge_neg2_i32(i32 ad
 
 ; GCN-LABEL: {{^}}commute_slt_neg16_i32:
 ; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
-define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -156,7 +156,7 @@ define void @commute_slt_neg16_i32(i32 a
 
 ; GCN-LABEL: {{^}}commute_sle_5_i32:
 ; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
-define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -173,7 +173,7 @@ define void @commute_sle_5_i32(i32 addrs
 
 ; GCN-LABEL: {{^}}commute_eq_64_i64:
 ; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -186,7 +186,7 @@ define void @commute_eq_64_i64(i32 addrs
 
 ; GCN-LABEL: {{^}}commute_ne_64_i64:
 ; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -199,7 +199,7 @@ define void @commute_ne_64_i64(i32 addrs
 
 ; GCN-LABEL: {{^}}commute_ugt_64_i64:
 ; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -212,7 +212,7 @@ define void @commute_ugt_64_i64(i32 addr
 
 ; GCN-LABEL: {{^}}commute_uge_64_i64:
 ; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -225,7 +225,7 @@ define void @commute_uge_64_i64(i32 addr
 
 ; GCN-LABEL: {{^}}commute_ult_64_i64:
 ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -238,7 +238,7 @@ define void @commute_ult_64_i64(i32 addr
 
 ; GCN-LABEL: {{^}}commute_ule_63_i64:
 ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -254,7 +254,7 @@ define void @commute_ule_63_i64(i32 addr
 ; GCN-LABEL: {{^}}commute_ule_64_i64:
 ; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
 ; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -267,7 +267,7 @@ define void @commute_ule_64_i64(i32 addr
 
 ; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
 ; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -280,7 +280,7 @@ define void @commute_sgt_neg1_i64(i32 ad
 
 ; GCN-LABEL: {{^}}commute_sge_neg2_i64:
 ; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -293,7 +293,7 @@ define void @commute_sge_neg2_i64(i32 ad
 
 ; GCN-LABEL: {{^}}commute_slt_neg16_i64:
 ; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -306,7 +306,7 @@ define void @commute_slt_neg16_i64(i32 a
 
 ; GCN-LABEL: {{^}}commute_sle_5_i64:
 ; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -324,7 +324,7 @@ define void @commute_sle_5_i64(i32 addrs
 
 ; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
 ; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -338,7 +338,7 @@ define void @commute_oeq_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
 ; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -351,7 +351,7 @@ define void @commute_ogt_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_oge_2.0_f32:
 ; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -364,7 +364,7 @@ define void @commute_oge_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_olt_2.0_f32:
 ; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -377,7 +377,7 @@ define void @commute_olt_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_ole_2.0_f32:
 ; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -390,7 +390,7 @@ define void @commute_ole_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_one_2.0_f32:
 ; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -403,7 +403,7 @@ define void @commute_one_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_ord_2.0_f32:
 ; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
-define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -416,7 +416,7 @@ define void @commute_ord_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
 ; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -429,7 +429,7 @@ define void @commute_ueq_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
 ; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -442,7 +442,7 @@ define void @commute_ugt_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_uge_2.0_f32:
 ; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -455,7 +455,7 @@ define void @commute_uge_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_ult_2.0_f32:
 ; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -468,7 +468,7 @@ define void @commute_ult_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_ule_2.0_f32:
 ; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -481,7 +481,7 @@ define void @commute_ule_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_une_2.0_f32:
 ; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -494,7 +494,7 @@ define void @commute_une_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_uno_2.0_f32:
 ; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
-define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -512,7 +512,7 @@ define void @commute_uno_2.0_f32(i32 add
 
 ; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
 ; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -526,7 +526,7 @@ define void @commute_oeq_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
 ; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -539,7 +539,7 @@ define void @commute_ogt_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_oge_2.0_f64:
 ; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -552,7 +552,7 @@ define void @commute_oge_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_olt_2.0_f64:
 ; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -565,7 +565,7 @@ define void @commute_olt_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_ole_2.0_f64:
 ; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -578,7 +578,7 @@ define void @commute_ole_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_one_2.0_f64:
 ; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -591,7 +591,7 @@ define void @commute_one_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_ord_2.0_f64:
 ; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
-define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -604,7 +604,7 @@ define void @commute_ord_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
 ; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -617,7 +617,7 @@ define void @commute_ueq_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
 ; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -630,7 +630,7 @@ define void @commute_ugt_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_uge_2.0_f64:
 ; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -643,7 +643,7 @@ define void @commute_uge_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_ult_2.0_f64:
 ; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -656,7 +656,7 @@ define void @commute_ult_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_ule_2.0_f64:
 ; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -669,7 +669,7 @@ define void @commute_ule_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_une_2.0_f64:
 ; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -682,7 +682,7 @@ define void @commute_une_2.0_f64(i32 add
 
 ; GCN-LABEL: {{^}}commute_uno_2.0_f64:
 ; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
-define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -703,7 +703,7 @@ define void @commute_uno_2.0_f64(i32 add
 
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
-define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %stack0 = alloca i32
   %ptr0 = load volatile i32*, i32* addrspace(1)* undef

Modified: llvm/trunk/test/CodeGen/AMDGPU/commute_modifiers.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/commute_modifiers.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/commute_modifiers.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/commute_modifiers.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@ declare float @llvm.fma.f32(float, float
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, 2.0
 ; SI: buffer_store_dword [[REG]]
-define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %x = load float, float addrspace(1)* %gep.0
@@ -22,7 +22,7 @@ define void @commute_add_imm_fabs_f32(fl
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -4.0
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %x = load float, float addrspace(1)* %gep.0
@@ -37,7 +37,7 @@ define void @commute_mul_imm_fneg_fabs_f
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %x = load float, float addrspace(1)* %gep.0
@@ -53,7 +53,7 @@ define void @commute_mul_imm_fneg_f32(fl
 ; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[K]], |[[X]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %x = load float, float addrspace(1)* %gep.0
@@ -68,7 +68,7 @@ define void @commute_add_lit_fabs_f32(fl
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -85,7 +85,7 @@ define void @commute_add_fabs_f32(float
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -102,7 +102,7 @@ define void @commute_mul_fneg_f32(float
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -121,7 +121,7 @@ define void @commute_mul_fabs_fneg_f32(f
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -139,7 +139,7 @@ define void @commute_mul_fabs_x_fabs_y_f
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -161,7 +161,7 @@ define void @commute_mul_fabs_x_fneg_fab
 ; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]|
 ; SI: buffer_store_dword [[RESULT]]
-define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/concat_vectors.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/concat_vectors.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/concat_vectors.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/concat_vectors.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@
 ; value if we want to ensure scratch memory is not being used.
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
   %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
   store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8
   ret void
@@ -17,7 +17,7 @@ define void @test_concat_v1i32(<2 x i32>
 ; FUNC-LABEL: {{^}}test_concat_v2i32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
   %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16
   ret void
@@ -26,7 +26,7 @@ define void @test_concat_v2i32(<4 x i32>
 ; FUNC-LABEL: {{^}}test_concat_v4i32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
   %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32
   ret void
@@ -35,7 +35,7 @@ define void @test_concat_v4i32(<8 x i32>
 ; FUNC-LABEL: {{^}}test_concat_v8i32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
   %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64
   ret void
@@ -44,7 +44,7 @@ define void @test_concat_v8i32(<16 x i32
 ; FUNC-LABEL: {{^}}test_concat_v16i32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
   %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128
   ret void
@@ -53,7 +53,7 @@ define void @test_concat_v16i32(<32 x i3
 ; FUNC-LABEL: {{^}}test_concat_v1f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
   %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> <i32 0, i32 1>
   store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -62,7 +62,7 @@ define void @test_concat_v1f32(<2 x floa
 ; FUNC-LABEL: {{^}}test_concat_v2f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -71,7 +71,7 @@ define void @test_concat_v2f32(<4 x floa
 ; FUNC-LABEL: {{^}}test_concat_v4f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -80,7 +80,7 @@ define void @test_concat_v4f32(<8 x floa
 ; FUNC-LABEL: {{^}}test_concat_v8f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64
   ret void
@@ -89,7 +89,7 @@ define void @test_concat_v8f32(<16 x flo
 ; FUNC-LABEL: {{^}}test_concat_v16f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128
   ret void
@@ -98,7 +98,7 @@ define void @test_concat_v16f32(<32 x fl
 ; FUNC-LABEL: {{^}}test_concat_v1i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
   %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
   store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -107,7 +107,7 @@ define void @test_concat_v1i64(<2 x doub
 ; FUNC-LABEL: {{^}}test_concat_v2i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
   ret void
@@ -116,7 +116,7 @@ define void @test_concat_v2i64(<4 x doub
 ; FUNC-LABEL: {{^}}test_concat_v4i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
   ret void
@@ -125,7 +125,7 @@ define void @test_concat_v4i64(<8 x doub
 ; FUNC-LABEL: {{^}}test_concat_v8i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
   ret void
@@ -134,7 +134,7 @@ define void @test_concat_v8i64(<16 x dou
 ; FUNC-LABEL: {{^}}test_concat_v16i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
   ret void
@@ -143,7 +143,7 @@ define void @test_concat_v16i64(<32 x do
 ; FUNC-LABEL: {{^}}test_concat_v1f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
   %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
   store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -152,7 +152,7 @@ define void @test_concat_v1f64(<2 x doub
 ; FUNC-LABEL: {{^}}test_concat_v2f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
   ret void
@@ -161,7 +161,7 @@ define void @test_concat_v2f64(<4 x doub
 ; FUNC-LABEL: {{^}}test_concat_v4f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
   ret void
@@ -170,7 +170,7 @@ define void @test_concat_v4f64(<8 x doub
 ; FUNC-LABEL: {{^}}test_concat_v8f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
   ret void
@@ -179,7 +179,7 @@ define void @test_concat_v8f64(<16 x dou
 ; FUNC-LABEL: {{^}}test_concat_v16f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
   ret void
@@ -188,7 +188,7 @@ define void @test_concat_v16f64(<32 x do
 ; FUNC-LABEL: {{^}}test_concat_v1i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
   %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1>
   store <2 x i1> %concat, <2 x i1> addrspace(1)* %out
   ret void
@@ -197,7 +197,7 @@ define void @test_concat_v1i1(<2 x i1> a
 ; FUNC-LABEL: {{^}}test_concat_v2i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
   %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i1> %concat, <4 x i1> addrspace(1)* %out
   ret void
@@ -206,7 +206,7 @@ define void @test_concat_v2i1(<4 x i1> a
 ; FUNC-LABEL: {{^}}test_concat_v4i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
   %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i1> %concat, <8 x i1> addrspace(1)* %out
   ret void
@@ -215,7 +215,7 @@ define void @test_concat_v4i1(<8 x i1> a
 ; FUNC-LABEL: {{^}}test_concat_v8i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
   %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x i1> %concat, <16 x i1> addrspace(1)* %out
   ret void
@@ -224,7 +224,7 @@ define void @test_concat_v8i1(<16 x i1>
 ; FUNC-LABEL: {{^}}test_concat_v16i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
   %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x i1> %concat, <32 x i1> addrspace(1)* %out
   ret void
@@ -233,7 +233,7 @@ define void @test_concat_v16i1(<32 x i1>
 ; FUNC-LABEL: {{^}}test_concat_v32i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
   %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
   store <64 x i1> %concat, <64 x i1> addrspace(1)* %out
   ret void
@@ -242,7 +242,7 @@ define void @test_concat_v32i1(<64 x i1>
 ; FUNC-LABEL: {{^}}test_concat_v1i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
   %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
   store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4
   ret void
@@ -251,7 +251,7 @@ define void @test_concat_v1i16(<2 x i16>
 ; FUNC-LABEL: {{^}}test_concat_v2i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
   %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8
   ret void
@@ -260,7 +260,7 @@ define void @test_concat_v2i16(<4 x i16>
 ; FUNC-LABEL: {{^}}test_concat_v4i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
   %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16
   ret void
@@ -269,7 +269,7 @@ define void @test_concat_v4i16(<8 x i16>
 ; FUNC-LABEL: {{^}}test_concat_v8i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
   %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32
   ret void
@@ -278,7 +278,7 @@ define void @test_concat_v8i16(<16 x i16
 ; FUNC-LABEL: {{^}}test_concat_v16i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
   %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64
   ret void
@@ -286,7 +286,7 @@ define void @test_concat_v16i16(<32 x i1
 
 ; FUNC-LABEL: {{^}}concat_vector_crash:
 ; SI: s_endpgm
-define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
 bb:
   %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
   %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

Modified: llvm/trunk/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir Tue Mar 21 16:39:51 2017
@@ -1,12 +1,12 @@
 # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s
 --- |
-  define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  define amdgpu_kernel void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
     %and = and i32 %a, 1234567
     store volatile i32 %and, i32 addrspace(1)* %out
     ret void
   }
 
-  define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+  define amdgpu_kernel void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %idxprom = sext i32 %tid to i64
     %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@@ -17,13 +17,13 @@
     ret void
   }
 
-  define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  define amdgpu_kernel void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
     %shl = shl i32 %a, 12
     store volatile i32 %shl, i32 addrspace(1)* %out
     ret void
   }
 
-  define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+  define amdgpu_kernel void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %idxprom = sext i32 %tid to i64
     %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@@ -34,13 +34,13 @@
     ret void
   }
 
-  define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  define amdgpu_kernel void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
     %ashr = ashr i32 %a, 12
     store volatile i32 %ashr, i32 addrspace(1)* %out
     ret void
   }
 
-  define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+  define amdgpu_kernel void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %idxprom = sext i32 %tid to i64
     %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@@ -51,13 +51,13 @@
     ret void
   }
 
-   define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+   define amdgpu_kernel void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
     %lshr = lshr i32 %a, 12
     store volatile i32 %lshr, i32 addrspace(1)* %out
     ret void
   }
 
-  define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+  define amdgpu_kernel void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %idxprom = sext i32 %tid to i64
     %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom

Modified: llvm/trunk/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
   %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %and = and i32 %size, %x
@@ -17,7 +17,7 @@ define void @fold_mi_v_and_0(i32 addrspa
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %and = and i32 %size, %x
   store i32 %and, i32 addrspace(1)* %out
@@ -28,7 +28,7 @@ define void @fold_mi_s_and_0(i32 addrspa
 ; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
   %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %or = or i32 %size, %x
@@ -42,7 +42,7 @@ define void @fold_mi_v_or_0(i32 addrspac
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %or = or i32 %size, %x
   store i32 %or, i32 addrspace(1)* %out
@@ -53,7 +53,7 @@ define void @fold_mi_s_or_0(i32 addrspac
 ; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
   %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %xor = xor i32 %size, %x
@@ -67,7 +67,7 @@ define void @fold_mi_v_xor_0(i32 addrspa
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %xor = xor i32 %size, %x
   store i32 %xor, i32 addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fold_mi_s_xor_0(i32 addrspa
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}}
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %xor = xor i32 %size, -1
   store i32 %xor, i32 addrspace(1)* %out
@@ -91,7 +91,7 @@ define void @fold_mi_s_not_0(i32 addrspa
 ; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]]
 ; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}}
 ; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
   %vreg = load volatile i64, i64 addrspace(1)* undef
   %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg)
   %xor = xor i64 %ctpop, -1
@@ -110,7 +110,7 @@ define void @fold_mi_v_not_0(i64 addrspa
 ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
   %vreg0 = load volatile i64, i64 addrspace(1)* undef
   %vreg1 = load volatile i64, i64 addrspace(1)* undef
   %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)
@@ -126,7 +126,7 @@ define void @fold_mi_or_neg1(i64 addrspa
 ; GCN: v_not_b32
 ; GCN: v_and_b32
 ; GCN-NOT: v_and_b32
-define void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
   %vreg0 = load volatile i64, i64 addrspace(1)* undef
   %vreg1 = load volatile i64, i64 addrspace(1)* undef
   %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)

Modified: llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll Tue Mar 21 16:39:51 2017
@@ -72,7 +72,7 @@
 ; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
-define void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %load0 = load volatile i32, i32 addrspace(3)* undef
@@ -150,7 +150,7 @@ endif:
 ; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
-define void @divergent_loop(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %load0 = load volatile i32, i32 addrspace(3)* undef
@@ -272,7 +272,7 @@ end:
 
 ; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
-define void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %load0 = load volatile i32, i32 addrspace(3)* undef

Modified: llvm/trunk/test/CodeGen/AMDGPU/convergent-inlineasm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/convergent-inlineasm.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/convergent-inlineasm.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/convergent-inlineasm.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; GCN: v_cmp_ne_u32_e64
 ; GCN: ; mask branch
 ; GCN: BB{{[0-9]+_[0-9]+}}:
-define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) #1
@@ -30,7 +30,7 @@ bb5:
 
 ; GCN: BB{{[0-9]+_[0-9]+}}:
 
-define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1)

Modified: llvm/trunk/test/CodeGen/AMDGPU/copy-illegal-type.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/copy-illegal-type.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/copy-illegal-type.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/copy-illegal-type.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.y()
 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: s_endpgm
-define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
   ret void
@@ -19,7 +19,7 @@ define void @test_copy_v4i8(<4 x i8> add
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: s_endpgm
-define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@@ -32,7 +32,7 @@ define void @test_copy_v4i8_x2(<4 x i8>
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: s_endpgm
-define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@@ -47,7 +47,7 @@ define void @test_copy_v4i8_x3(<4 x i8>
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: s_endpgm
-define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@@ -65,7 +65,7 @@ define void @test_copy_v4i8_x4(<4 x i8>
 ; GCN-DAG: buffer_store_dword
 
 ; GCN: s_endpgm
-define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
@@ -85,7 +85,7 @@ define void @test_copy_v4i8_extra_use(<4
 ; GCN: {{buffer|flat}}_store_dword
 ; GCN: {{buffer|flat}}_store_dword
 ; GCN: s_endpgm
-define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@@ -101,7 +101,7 @@ define void @test_copy_v4i8_x2_extra_use
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
 ; GCN: s_endpgm
-define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
   %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
   ret void
@@ -113,7 +113,7 @@ define void @test_copy_v3i8_align4(<3 x
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
 ; GCN: s_endpgm
-define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
   %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
   ret void
@@ -128,7 +128,7 @@ define void @test_copy_v3i8_align2(<3 x
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
   %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
   ret void
@@ -141,7 +141,7 @@ define void @test_copy_v3i8_align1(<3 x
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
   ret void
@@ -157,7 +157,7 @@ define void @test_copy_v4i8_volatile_loa
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/copy-to-reg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/copy-to-reg.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/copy-to-reg.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/copy-to-reg.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 
 ; Make sure this doesn't crash
 ; CHECK-LABEL: {{^}}copy_to_reg_frameindex:
-define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %alloca = alloca [16 x i32]
   br label %loop

Modified: llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ctlz.ll Tue Mar 21 16:39:51 2017
@@ -27,7 +27,7 @@ declare i32 @llvm.r600.read.tidig.x() no
 
 ; EG: FFBH_UINT
 ; EG: CNDE_INT
-define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
   ret void
@@ -43,7 +43,7 @@ define void @s_ctlz_i32(i32 addrspace(1)
 
 ; EG: FFBH_UINT
 ; EG: CNDE_INT
-define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
@@ -61,7 +61,7 @@ define void @v_ctlz_i32(i32 addrspace(1)
 ; EG: CNDE_INT
 ; EG: FFBH_UINT
 ; EG: CNDE_INT
-define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
   %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
   store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
@@ -89,7 +89,7 @@ define void @v_ctlz_v2i32(<2 x i32> addr
 
 ; EG-DAG: FFBH_UINT
 ; EG-DAG: CNDE_INT
-define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
   store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
@@ -101,7 +101,7 @@ define void @v_ctlz_v4i32(<4 x i32> addr
 ; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_byte [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %val = load i8, i8 addrspace(1)* %valptr
   %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
   store i8 %ctlz, i8 addrspace(1)* %out
@@ -119,14 +119,14 @@ define void @v_ctlz_i8(i8 addrspace(1)*
 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
   store i64 %ctlz, i64 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_ctlz_i64_trunc:
-define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
   %trunc = trunc i64 %ctlz to i32
   store i32 %trunc, i32 addrspace(1)* %out
@@ -145,7 +145,7 @@ define void @s_ctlz_i64_trunc(i32 addrsp
 ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
 ; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
-define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -156,7 +156,7 @@ define void @v_ctlz_i64(i64 addrspace(1)
 }
 
 ; FUNC-LABEL: {{^}}v_ctlz_i64_trunc:
-define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -172,7 +172,7 @@ define void @v_ctlz_i64_trunc(i32 addrsp
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
- define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   %cmp = icmp eq i32 %val, 0
@@ -186,7 +186,7 @@ define void @v_ctlz_i64_trunc(i32 addrsp
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   %cmp = icmp ne i32 %val, 0
@@ -202,7 +202,7 @@ define void @v_ctlz_i32_sel_ne_neg1(i32
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: s_endpgm
-define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   %cmp = icmp eq i32 %ctlz, 32
@@ -217,7 +217,7 @@ define void @v_ctlz_i32_sel_eq_bitwidth(
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: s_endpgm
-define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   %cmp = icmp ne i32 %ctlz, 32
@@ -230,7 +230,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(
 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; GCN: {{buffer|flat}}_store_byte [[FFBH]],
- define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
   %val = load i8, i8 addrspace(1)* %valptr.gep
@@ -245,7 +245,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(
 ; SI: buffer_load_ushort [[VAL:v[0-9]+]],
 ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_short [[FFBH]],
- define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
   %val = load i16, i16 addrspace(1)* %valptr
   %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
   %cmp = icmp eq i16 %val, 0
@@ -260,7 +260,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(
 ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]]
 ; GCN: {{buffer|flat}}_store_byte [[TRUNC]],
-define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
   %val = load i7, i7 addrspace(1)* %valptr.gep

Modified: llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ctlz_zero_undef.ll Tue Mar 21 16:39:51 2017
@@ -22,7 +22,7 @@ declare i32 @llvm.r600.read.tidig.x() no
 ; GCN: s_endpgm
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
   ret void
@@ -35,7 +35,7 @@ define void @s_ctlz_zero_undef_i32(i32 a
 ; GCN: s_endpgm
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
@@ -51,7 +51,7 @@ define void @v_ctlz_zero_undef_i32(i32 a
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
   %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
   store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
@@ -71,7 +71,7 @@ define void @v_ctlz_zero_undef_v2i32(<2
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
   store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
@@ -82,7 +82,7 @@ define void @v_ctlz_zero_undef_v4i32(<4
 ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_byte [[RESULT]],
-define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %val = load i8, i8 addrspace(1)* %valptr
   %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
   store i8 %ctlz, i8 addrspace(1)* %out
@@ -100,14 +100,14 @@ define void @v_ctlz_zero_undef_i8(i8 add
 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
   store i64 %ctlz, i64 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64_trunc:
-define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
   %trunc = trunc i64 %ctlz to i32
   store i32 %trunc, i32 addrspace(1)* %out
@@ -123,7 +123,7 @@ define void @s_ctlz_zero_undef_i64_trunc
 ; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -134,7 +134,7 @@ define void @v_ctlz_zero_undef_i64(i64 a
 }
 
 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64_trunc:
-define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -149,7 +149,7 @@ define void @v_ctlz_zero_undef_i64_trunc
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
- define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp eq i32 %val, 0
@@ -162,7 +162,7 @@ define void @v_ctlz_zero_undef_i64_trunc
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
-define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp ne i32 %val, 0
@@ -175,7 +175,7 @@ define void @v_ctlz_zero_undef_i32_sel_n
 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; GCN: {{buffer|flat}}_store_byte [[FFBH]],
-define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
   %val = load i8, i8 addrspace(1)* %valptr.gep
@@ -194,7 +194,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq
 ; GCN-DAG: buffer_store_dword [[RESULT0]]
 ; GCN-DAG: buffer_store_byte [[RESULT1]]
 ; GCN: s_endpgm
- define void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp eq i32 %val, 0
@@ -211,7 +211,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: buffer_store_dword
- define void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp eq i32 %val, 0
@@ -227,7 +227,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: buffer_store_dword
-define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp ne i32 %val, 0
@@ -243,7 +243,7 @@ define void @v_ctlz_zero_undef_i32_sel_n
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: buffer_store_dword
- define void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp eq i32 %val, 1
@@ -259,7 +259,7 @@ define void @v_ctlz_zero_undef_i32_sel_n
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: buffer_store_dword
-define void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp ne i32 %val, 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/ctpop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ctpop.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ctpop.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ctpop.ll Tue Mar 21 16:39:51 2017
@@ -16,7 +16,7 @@ declare <16 x i32> @llvm.ctpop.v16i32(<1
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   store i32 %ctpop, i32 addrspace(1)* %out, align 4
   ret void
@@ -30,7 +30,7 @@ define void @s_ctpop_i32(i32 addrspace(1
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   store i32 %ctpop, i32 addrspace(1)* %out, align 4
@@ -48,7 +48,7 @@ define void @v_ctpop_i32(i32 addrspace(1
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
+define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
   %val0 = load i32, i32 addrspace(1)* %in0, align 4
   %val1 = load i32, i32 addrspace(1)* %in1, align 4
   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
@@ -64,7 +64,7 @@ define void @v_ctpop_add_chain_i32(i32 a
 ; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
+define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
   %val0 = load i32, i32 addrspace(1)* %in0, align 4
   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
   %add = add i32 %ctpop0, %sval
@@ -79,7 +79,7 @@ define void @v_ctpop_add_sgpr_i32(i32 ad
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
   store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -97,7 +97,7 @@ define void @v_ctpop_v2i32(<2 x i32> add
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
   store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -123,7 +123,7 @@ define void @v_ctpop_v4i32(<4 x i32> add
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
   %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
   %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
   store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
@@ -165,7 +165,7 @@ define void @v_ctpop_v8i32(<8 x i32> add
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
   %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
   %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
   store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
@@ -179,7 +179,7 @@ define void @v_ctpop_v16i32(<16 x i32> a
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, 4
@@ -194,7 +194,7 @@ define void @v_ctpop_i32_add_inline_cons
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 4, %ctpop
@@ -209,7 +209,7 @@ define void @v_ctpop_i32_add_inline_cons
 ; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, 99999
@@ -225,7 +225,7 @@ define void @v_ctpop_i32_add_literal(i32
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, %const
@@ -241,7 +241,7 @@ define void @v_ctpop_i32_add_var(i32 add
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %const, %ctpop
@@ -258,7 +258,7 @@ define void @v_ctpop_i32_add_var_inv(i32
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
@@ -279,7 +279,7 @@ define void @v_ctpop_i32_add_vvar_inv(i3
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
 ; EG: BCNT_INT
-define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
+define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
 entry:
   %tmp0 = icmp eq i32 %cond, 0
   br i1 %tmp0, label %if, label %else

Modified: llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ctpop64.ll Tue Mar 21 16:39:51 2017
@@ -17,7 +17,7 @@ declare i128 @llvm.ctpop.i128(i128) noun
 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; GCN: buffer_store_dword [[VRESULT]],
 ; GCN: s_endpgm
-define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
   store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -31,7 +31,7 @@ define void @s_ctpop_i64(i32 addrspace(1
 ; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %val = load i64, i64 addrspace(1)* %in, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
@@ -48,7 +48,7 @@ define void @v_ctpop_i64(i32 addrspace(1
 ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
+define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
   %val = load i64, i64 addrspace(1)* %in, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %or = or i64 %ctpop, %s.val
@@ -60,7 +60,7 @@ define void @v_ctpop_i64_user(i64 addrsp
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_endpgm
-define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
+define amdgpu_kernel void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
   store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -73,7 +73,7 @@ define void @s_ctpop_v2i64(<2 x i32> add
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_endpgm
-define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
+define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
   store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -86,7 +86,7 @@ define void @s_ctpop_v4i64(<4 x i32> add
 ; GCN: v_bcnt_u32_b32
 ; GCN: v_bcnt_u32_b32
 ; GCN: s_endpgm
-define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
@@ -104,7 +104,7 @@ define void @v_ctpop_v2i64(<2 x i32> add
 ; GCN: v_bcnt_u32_b32
 ; GCN: v_bcnt_u32_b32
 ; GCN: s_endpgm
-define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
   %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
@@ -121,7 +121,7 @@ define void @v_ctpop_v4i64(<4 x i32> add
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]]
 ; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
 ; GCN: s_endpgm
-define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
+define amdgpu_kernel void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
 entry:
   %tmp0 = icmp eq i32 %cond, 0
   br i1 %tmp0, label %if, label %else
@@ -146,7 +146,7 @@ endif:
 ; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]],
 ; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT1]], [[SRESULT0]]
 ; GCN: s_endpgm
-define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
   %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
   %truncctpop = trunc i128 %ctpop to i32
   store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -159,7 +159,7 @@ define void @s_ctpop_i128(i32 addrspace(
 ; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]],
 ; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]]
 ; GCN: s_endpgm
-define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
   %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
   %truncctpop = trunc i65 %ctpop to i32
   store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -181,7 +181,7 @@ define void @s_ctpop_i65(i32 addrspace(1
 
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
   %val = load i128, i128 addrspace(1)* %in, align 8
   %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
   %truncctpop = trunc i128 %ctpop to i32

Modified: llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll Tue Mar 21 16:39:51 2017
@@ -14,7 +14,7 @@ declare <4 x i32> @llvm.cttz.v4i32(<4 x
 ; SI: s_endpgm
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %cttz, i32 addrspace(1)* %out, align 4
   ret void
@@ -27,7 +27,7 @@ define void @s_cttz_zero_undef_i32(i32 a
 ; SI: s_endpgm
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr, align 4
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %cttz, i32 addrspace(1)* %out, align 4
@@ -43,7 +43,7 @@ define void @v_cttz_zero_undef_i32(i32 a
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
   %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
   store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
@@ -63,7 +63,7 @@ define void @v_cttz_zero_undef_v2i32(<2
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
   %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
   store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16

Modified: llvm/trunk/test/CodeGen/AMDGPU/cube.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cube.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cube.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cube.ll Tue Mar 21 16:39:51 2017
@@ -12,7 +12,7 @@ declare float @llvm.amdgcn.cubema(float,
 ; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: _store_dwordx4
-define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
   %cubeid = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
   %cubesc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
   %cubetc = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)

Modified: llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.y()
 ; GCN-NOT: lshr
 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
 ; GCN: buffer_store_dword [[CONV]],
-define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
   %load = load i8, i8 addrspace(1)* %in, align 1
   %cvt = uitofp i8 %load to float
   store float %cvt, float addrspace(1)* %out, align 4
@@ -22,7 +22,7 @@ define void @load_i8_to_f32(float addrsp
 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
   %cvt = uitofp <2 x i8> %load to <2 x float>
   store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
@@ -36,7 +36,7 @@ define void @load_v2i8_to_v2f32(<2 x flo
 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
   %cvt = uitofp <3 x i8> %load to <3 x float>
   store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
@@ -52,7 +52,7 @@ define void @load_v3i8_to_v3f32(<3 x flo
 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %cvt = uitofp <4 x i8> %load to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -76,7 +76,7 @@ define void @load_v4i8_to_v4f32(<4 x flo
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
 
 ; GCN: buffer_store_dwordx4
-define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <4 x i8> %load to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -110,7 +110,7 @@ define void @load_v4i8_to_v4f32_unaligne
 ; GCN: {{buffer|flat}}_store_dword
 
 ; GCN: s_endpgm
-define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@@ -124,7 +124,7 @@ define void @load_v4i8_to_v4f32_2_uses(<
 ; Make sure this doesn't crash.
 ; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
 ; GCN: s_endpgm
-define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <7 x i8> %load to <7 x float>
   store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
@@ -147,7 +147,7 @@ define void @load_v7i8_to_v7f32(<7 x flo
 ; GCN-NOT: lshr
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
   %cvt = uitofp <8 x i8> %load to <8 x float>
   store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
@@ -159,7 +159,7 @@ define void @load_v8i8_to_v8f32(<8 x flo
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
 ; GCN: buffer_store_dword [[CONV]],
-define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 2
   %inreg = and i32 %add, 255
@@ -169,7 +169,7 @@ define void @i8_zext_inreg_i32_to_f32(fl
 }
 
 ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
-define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %inreg = and i32 %load, 65280
   %shr = lshr i32 %inreg, 8
@@ -181,7 +181,7 @@ define void @i8_zext_inreg_hi1_to_f32(fl
 ; We don't get these ones because of the zext, but instcombine removes
 ; them so it shouldn't really matter.
 ; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
-define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
   %load = load i8, i8 addrspace(1)* %in, align 1
   %ext = zext i8 %load to i32
   %cvt = uitofp i32 %ext to float
@@ -190,7 +190,7 @@ define void @i8_zext_i32_to_f32(float ad
 }
 
 ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
-define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
   %ext = zext <4 x i8> %load to <4 x i32>
   %cvt = uitofp <4 x i32> %ext to <4 x float>
@@ -203,7 +203,7 @@ define void @v4i8_zext_v4i32_to_v4f32(<4
 ; GCN-NOT: [[VAL]]
 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in
   %and = and i32 %val, 255
   %cvt = uitofp i32 %and to float
@@ -216,7 +216,7 @@ define void @extract_byte0_to_f32(float
 ; GCN-NOT: [[VAL]]
 ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in
   %srl = lshr i32 %val, 8
   %and = and i32 %srl, 255
@@ -230,7 +230,7 @@ define void @extract_byte1_to_f32(float
 ; GCN-NOT: [[VAL]]
 ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in
   %srl = lshr i32 %val, 16
   %and = and i32 %srl, 255
@@ -244,7 +244,7 @@ define void @extract_byte2_to_f32(float
 ; GCN-NOT: [[VAL]]
 ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in
   %srl = lshr i32 %val, 24
   %and = and i32 %srl, 255

Modified: llvm/trunk/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@ declare float @llvm.floor.f32(float) #1
 ; SI-NOT: add
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
   %floor = call float @llvm.floor.f32(float %x) #1
   %cvt = fptosi float %floor to i32
   store i32 %cvt, i32 addrspace(1)* %out
@@ -22,7 +22,7 @@ define void @cvt_flr_i32_f32_0(i32 addrs
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]]
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
   %fadd = fadd float %x, 1.0
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptosi float %floor to i32
@@ -35,7 +35,7 @@ define void @cvt_flr_i32_f32_1(i32 addrs
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %floor = call float @llvm.floor.f32(float %x.fabs) #1
   %cvt = fptosi float %floor to i32
@@ -48,7 +48,7 @@ define void @cvt_flr_i32_f32_fabs(i32 ad
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
   %x.fneg = fsub float -0.000000e+00, %x
   %floor = call float @llvm.floor.f32(float %x.fneg) #1
   %cvt = fptosi float %floor to i32
@@ -61,7 +61,7 @@ define void @cvt_flr_i32_f32_fneg(i32 ad
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
   %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1
@@ -75,7 +75,7 @@ define void @cvt_flr_i32_f32_fabs_fneg(i
 ; SI: v_floor_f32
 ; SI: v_cvt_u32_f32_e32
 ; SI: s_endpgm
-define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
   %floor = call float @llvm.floor.f32(float %x) #1
   %cvt = fptoui float %floor to i32
   store i32 %cvt, i32 addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@ declare float @llvm.floor.f32(float) #1
 ; SI-SAFE-NOT: v_cvt_rpi_i32_f32
 ; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; SI: s_endpgm
-define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
   %fadd = fadd float %x, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptosi float %floor to i32
@@ -21,7 +21,7 @@ define void @cvt_rpi_i32_f32(i32 addrspa
 ; SI-SAFE-NOT: v_cvt_rpi_i32_f32
 ; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
 ; SI: s_endpgm
-define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %fadd = fadd float %x.fabs, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
@@ -37,7 +37,7 @@ define void @cvt_rpi_i32_f32_fabs(i32 ad
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
 ; SI: s_endpgm
-define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
   %x.fneg = fsub float -0.000000e+00, %x
   %fadd = fadd float %x.fneg, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
@@ -55,7 +55,7 @@ define void @cvt_rpi_i32_f32_fneg(i32 ad
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
 ; SI: s_endpgm
-define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
   %fadd = fadd float %x.fabs.fneg, 0.5
@@ -71,7 +71,7 @@ define void @cvt_rpi_i32_f32_fabs_fneg(i
 ; SI: v_floor_f32
 ; SI: v_cvt_u32_f32
 ; SI: s_endpgm
-define void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
   %fadd = fadd float %x, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptoui float %floor to i32

Modified: llvm/trunk/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
 
-define void @store_same_base_ptr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @store_same_base_ptr(i32 addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #0
   %offset = sext i32 %id to i64

Modified: llvm/trunk/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@
 ; CHECK: {{^}}sint:
 ; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %sint = load i32, i32 addrspace(1) * %in
@@ -24,7 +24,7 @@ entry:
 ;CHECK: {{^}}uint:
 ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %uint = load i32, i32 addrspace(1) * %in

Modified: llvm/trunk/test/CodeGen/AMDGPU/debug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/debug.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/debug.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/debug.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 ; Test for a crash in the custom assembly dump code.
 
 ; SI: s_endpgm
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
   store i32 0, i32 addrspace(1)* %out
   ret void
 }

Modified: llvm/trunk/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/debugger-emit-prologue.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/debugger-emit-prologue.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/debugger-emit-prologue.ll Tue Mar 21 16:39:51 2017
@@ -23,7 +23,7 @@
 ; NOATTR-NOT: DebuggerPrivateSegmentBufferSGPR
 
 ; Function Attrs: nounwind
-define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4
   store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/debugger-insert-nops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/debugger-insert-nops.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/debugger-insert-nops.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/debugger-insert-nops.ll Tue Mar 21 16:39:51 2017
@@ -22,7 +22,7 @@
 ; CHECK-NEXT: s_endpgm
 
 ; Function Attrs: nounwind
-define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4
   store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/debugger-reserve-regs.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/debugger-reserve-regs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/debugger-reserve-regs.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 ; CHECK-NEXT: ReservedVGPRCount: 4
 
 ; Function Attrs: nounwind
-define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4
   store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/default-fp-mode.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/default-fp-mode.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/default-fp-mode.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/default-fp-mode.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 ; GCN-LABEL: {{^}}test_default_si:
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
+define amdgpu_kernel void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -12,7 +12,7 @@ define void @test_default_si(float addrs
 ; GCN-LABEL: {{^}}test_default_vi:
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
+define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -21,7 +21,7 @@ define void @test_default_vi(float addrs
 ; GCN-LABEL: {{^}}test_f64_denormals:
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
+define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -30,7 +30,7 @@ define void @test_f64_denormals(float ad
 ; GCN-LABEL: {{^}}test_f32_denormals:
 ; GCNL: FloatMode: 48
 ; GCN: IeeeMode: 1
-define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
+define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -39,7 +39,7 @@ define void @test_f32_denormals(float ad
 ; GCN-LABEL: {{^}}test_f32_f64_denormals:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 1
-define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
+define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -48,7 +48,7 @@ define void @test_f32_f64_denormals(floa
 ; GCN-LABEL: {{^}}test_no_denormals
 ; GCN: FloatMode: 0
 ; GCN: IeeeMode: 1
-define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
+define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -57,7 +57,7 @@ define void @test_no_denormals(float add
 ; GCN-LABEL: {{^}}test_f16_f64_denormals:
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
+define amdgpu_kernel void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
   store half 0.0, half addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -66,7 +66,7 @@ define void @test_f16_f64_denormals(half
 ; GCN-LABEL: {{^}}test_no_f16_f64_denormals:
 ; GCN: FloatMode: 0
 ; GCN: IeeeMode: 1
-define void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 {
+define amdgpu_kernel void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 {
   store half 0.0, half addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -75,7 +75,7 @@ define void @test_no_f16_f64_denormals(h
 ; GCN-LABEL: {{^}}test_f32_f16_f64_denormals:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 1
-define void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 {
+define amdgpu_kernel void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 {
   store half 0.0, half addrspace(1)* %out0
   store float 0.0, float addrspace(1)* %out1
   store double 0.0, double addrspace(1)* %out2

Modified: llvm/trunk/test/CodeGen/AMDGPU/detect-dead-lanes.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/detect-dead-lanes.mir?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/detect-dead-lanes.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/detect-dead-lanes.mir Tue Mar 21 16:39:51 2017
@@ -1,14 +1,14 @@
 # RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s
 --- |
-  define void @test0() { ret void }
-  define void @test1() { ret void }
-  define void @test2() { ret void }
-  define void @test3() { ret void }
-  define void @test4() { ret void }
-  define void @test5() { ret void }
-  define void @loop0() { ret void }
-  define void @loop1() { ret void }
-  define void @loop2() { ret void }
+  define amdgpu_kernel void @test0() { ret void }
+  define amdgpu_kernel void @test1() { ret void }
+  define amdgpu_kernel void @test2() { ret void }
+  define amdgpu_kernel void @test3() { ret void }
+  define amdgpu_kernel void @test4() { ret void }
+  define amdgpu_kernel void @test5() { ret void }
+  define amdgpu_kernel void @loop0() { ret void }
+  define amdgpu_kernel void @loop1() { ret void }
+  define amdgpu_kernel void @loop2() { ret void }
 ...
 ---
 # Combined use/def transfer check, the basics.

Modified: llvm/trunk/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; CHECK: ALU_PUSH_BEFORE
 ; CHECK-NEXT: JUMP
 ; CHECK-NEXT: LOOP_BREAK
-define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind {
+define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind {
 entry:
   %cmp5 = icmp sgt i32 %iterations, 0
   br i1 %cmp5, label %for.body, label %for.end

Modified: llvm/trunk/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; GCN: buffer_load_dword
 ; GCN: ds_write2_b32
 ; GCN: s_endpgm
-define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
+define amdgpu_kernel void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
 entry:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx = shl i32 %tid, 2

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll Tue Mar 21 16:39:51 2017
@@ -23,7 +23,7 @@ declare void @llvm.amdgcn.s.barrier() #1
 ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34
 ; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
 ; CHECK: s_endpgm
-define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
+define amdgpu_kernel void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
 entry:
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %mul = shl nsw i32 %x.i, 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; GCN: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]]
 ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b
 ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
-define void @write_ds_sub0_offset0_global() #0 {
+define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
   %sub1 = sub i32 0, %x.i
@@ -24,7 +24,7 @@ entry:
 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]]
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13
 ; GCN: ds_write_b8 [[NEG]], [[K]] offset:65535
-define void @add_x_shl_neg_to_sub_max_offset() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -39,7 +39,7 @@ define void @add_x_shl_neg_to_sub_max_of
 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x10000, [[SCALED]]
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13
 ; GCN: ds_write_b8 [[NEG]], [[K]]{{$}}
-define void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -58,7 +58,7 @@ define void @add_x_shl_neg_to_sub_max_of
 ; GCN-NOT: v_sub
 ; GCN: ds_write_b32 [[NEG]], [[K]] offset:456{{$}}
 ; GCN: s_endpgm
-define void @add_x_shl_neg_to_sub_multi_use() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -80,7 +80,7 @@ define void @add_x_shl_neg_to_sub_multi_
 ; GCN-NOT: v_sub
 ; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}}
 ; GCN: s_endpgm
-define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -95,7 +95,7 @@ define void @add_x_shl_neg_to_sub_multi_
 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0
 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]]
 ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset0:254 offset1:255
-define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -109,7 +109,7 @@ define void @add_x_shl_neg_to_sub_misali
 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0
 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]]
 ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
-define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll Tue Mar 21 16:39:51 2017
@@ -12,7 +12,7 @@
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -31,7 +31,7 @@ define void @simple_read2_f32(float addr
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -49,7 +49,7 @@ define void @simple_read2_f32_max_offset
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
 ; SI: s_endpgm
-define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -66,7 +66,7 @@ define void @simple_read2_f32_too_far(fl
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 0
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
@@ -98,7 +98,7 @@ define void @simple_read2_f32_x2(float a
 ; SI: s_barrier
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 0
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
@@ -133,7 +133,7 @@ define void @simple_read2_f32_x2_barrier
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
@@ -170,7 +170,7 @@ define void @simple_read2_f32_x2_nonzero
 ; SI: ds_read_b32
 ; SI: ds_read_b32
 ; SI: s_endpgm
-define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
@@ -196,7 +196,7 @@ define void @read2_ptr_is_subreg_arg_f32
 ; SI: ds_read_b32
 ; SI: ds_read_b32
 ; SI: s_endpgm
-define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
@@ -219,7 +219,7 @@ define void @read2_ptr_is_subreg_arg_off
 ; SI-LABEL: {{^}}read2_ptr_is_subreg_f32:
 ; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
 ; SI: s_endpgm
-define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
   %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
@@ -243,7 +243,7 @@ define void @read2_ptr_is_subreg_f32(flo
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
 ; SI: s_endpgm
-define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
@@ -261,7 +261,7 @@ define void @simple_read2_f32_volatile_0
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
 ; SI: s_endpgm
-define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -280,7 +280,7 @@ define void @simple_read2_f32_volatile_1
 ; SI-LABEL: @unaligned_read2_f32
 ; SI-NOT: ds_read2_b32
 ; SI: s_endpgm
-define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 1
@@ -296,7 +296,7 @@ define void @unaligned_read2_f32(float a
 ; SI-LABEL: @misaligned_2_simple_read2_f32
 ; SI-NOT: ds_read2_b32
 ; SI: s_endpgm
-define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 2
@@ -315,7 +315,7 @@ define void @misaligned_2_simple_read2_f
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2_f64(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -331,7 +331,7 @@ define void @simple_read2_f64(double add
 ; SI-LABEL: @simple_read2_f64_max_offset
 ; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
 ; SI: s_endpgm
-define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -349,7 +349,7 @@ define void @simple_read2_f64_max_offset
 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
 ; SI: s_endpgm
-define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -367,7 +367,7 @@ define void @simple_read2_f64_too_far(do
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
 ; SI: s_endpgm
-define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 4
@@ -385,7 +385,7 @@ define void @misaligned_read2_f64(double
 ; SI-LABEL: @load_constant_adjacent_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
-define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
   %sum = add i32 %val0, %val1
@@ -396,7 +396,7 @@ define void @load_constant_adjacent_offs
 ; SI-LABEL: @load_constant_disjoint_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
-define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
   %sum = add i32 %val0, %val1
@@ -410,7 +410,7 @@ define void @load_constant_disjoint_offs
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
-define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   %sum = add i64 %val0, %val1
@@ -426,7 +426,7 @@ define void @load_misaligned64_constant_
 ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
 ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
 ; SI: s_endpgm
-define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
   %sum = add i64 %val0, %val1
@@ -437,7 +437,7 @@ define void @load_misaligned64_constant_
 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
 
-define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
+define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
   %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
   %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
@@ -481,13 +481,13 @@ define void @sgemm_inner_loop_read2_sequ
   ret void
 }
 
-define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
   %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
   store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
   ret void
 }
 
-define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
   %load = load i64, i64 addrspace(3)* %in, align 4
   store i64 %load, i64 addrspace(1)* %out, align 8
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds_read2_offset_order.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_read2_offset_order.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2_offset_order.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12
-define void @offset_order(float addrspace(1)* %out) {
+define amdgpu_kernel void @offset_order(float addrspace(1)* %out) {
 entry:
   %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0
   %val0 = load float, float addrspace(3)* %ptr0

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll Tue Mar 21 16:39:51 2017
@@ -12,7 +12,7 @@
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dwordx2 [[RESULT]]
 ; CI: s_endpgm
-define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds  [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
   %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4
@@ -26,7 +26,7 @@ define void @simple_read2_v2f32_superreg
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dwordx2 [[RESULT]]
 ; CI: s_endpgm
-define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
   %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0
@@ -43,7 +43,7 @@ define void @simple_read2_v2f32_superreg
 ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD2]]
 ; CI: s_endpgm
-define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
   %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4
@@ -68,7 +68,7 @@ define void @simple_read2_v4f32_superreg
 ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD1]]
 ; CI: s_endpgm
-define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i
   %val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4
@@ -88,7 +88,7 @@ define void @simple_read2_v3f32_superreg
 ; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
 ; CI: buffer_store_dwordx4 [[REG_ZW]]
 ; CI: s_endpgm
-define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
   %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8
@@ -101,7 +101,7 @@ define void @simple_read2_v4f32_superreg
 ; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
 ; CI: buffer_store_dwordx4 [[REG_ZW]]
 ; CI: s_endpgm
-define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
   %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0
@@ -117,7 +117,7 @@ define void @simple_read2_v4f32_superreg
 ; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
 ; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
 ; CI: s_endpgm
-define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i
   %val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0
@@ -138,7 +138,7 @@ define void @simple_read2_v8f32_superreg
 ; CI-DAG: buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
 ; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
 ; CI: s_endpgm
-define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i
   %val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0
@@ -153,7 +153,7 @@ define void @simple_read2_v16f32_superre
 ; CI-NOT: v_mov
 ; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}}
 ; CI: s_endpgm
-define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
@@ -176,7 +176,7 @@ define void @simple_read2_v2f32_superreg
 ; CI-NOT: v_mov
 ; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}}
 ; CI: s_endpgm
-define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds_read2st64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds_read2st64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_read2st64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2st64.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -29,7 +29,7 @@ define void @simple_read2st64_f32_0_1(fl
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
@@ -49,7 +49,7 @@ define void @simple_read2st64_f32_1_2(fl
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
@@ -69,7 +69,7 @@ define void @simple_read2st64_f32_max_of
 ; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
 ; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
 ; SI: s_endpgm
-define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
@@ -86,7 +86,7 @@ define void @simple_read2st64_f32_over_m
 ; SI-LABEL: @odd_invalid_read2st64_f32_0
 ; SI-NOT: ds_read2st64_b32
 ; SI: s_endpgm
-define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -102,7 +102,7 @@ define void @odd_invalid_read2st64_f32_0
 ; SI-LABEL: @odd_invalid_read2st64_f32_1
 ; SI-NOT: ds_read2st64_b32
 ; SI: s_endpgm
-define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
@@ -122,7 +122,7 @@ define void @odd_invalid_read2st64_f32_1
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -141,7 +141,7 @@ define void @simple_read2st64_f64_0_1(do
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -161,7 +161,7 @@ define void @simple_read2st64_f64_1_2(do
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
 ; SI: s_endpgm
-define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 4
@@ -181,7 +181,7 @@ define void @misaligned_read2st64_f64(do
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 256
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -201,7 +201,7 @@ define void @simple_read2st64_f64_max_of
 ; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
 ; SI: s_endpgm
-define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -218,7 +218,7 @@ define void @simple_read2st64_f64_over_m
 ; SI-LABEL: @invalid_read2st64_f64_odd_offset
 ; SI-NOT: ds_read2st64_b64
 ; SI: s_endpgm
-define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -239,7 +239,7 @@ define void @invalid_read2st64_f64_odd_o
 ; SI-NOT: ds_read2st_b64
 ; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
 ; SI: s_endpgm
-define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %val = load float, float addrspace(1)* %in.gep, align 4
@@ -27,7 +27,7 @@ define void @simple_write2_one_val_f32(f
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -46,7 +46,7 @@ define void @simple_write2_two_val_f32(f
 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -65,7 +65,7 @@ define void @simple_write2_two_val_f32_v
 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -86,7 +86,7 @@ define void @simple_write2_two_val_f32_v
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
@@ -107,7 +107,7 @@ define void @simple_write2_two_val_subre
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
   %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
@@ -126,7 +126,7 @@ define void @simple_write2_two_val_subre
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
   %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
@@ -146,7 +146,7 @@ define void @simple_write2_two_val_subre
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; SI: s_endpgm
-define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -164,7 +164,7 @@ define void @simple_write2_two_val_max_o
 ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
 ; SI: s_endpgm
-define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -182,7 +182,7 @@ define void @simple_write2_two_val_too_f
 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
@@ -212,7 +212,7 @@ define void @simple_write2_two_val_f32_x
 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
@@ -243,7 +243,7 @@ define void @simple_write2_two_val_f32_x
 ; SI: ds_write_b32
 ; SI: ds_write_b32
 ; SI: s_endpgm
-define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -270,7 +270,7 @@ define void @write2_ptr_subreg_arg_two_v
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
 ; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %val = load double, double addrspace(1)* %in.gep, align 8
@@ -288,7 +288,7 @@ define void @simple_write2_one_val_f64(d
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
 ; SI: s_endpgm
-define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %val = load double, double addrspace(1)* %in.gep, align 8
@@ -306,7 +306,7 @@ define void @misaligned_simple_write2_on
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
 ; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
@@ -325,7 +325,7 @@ define void @simple_write2_two_val_f64(d
 ; SI-LABEL: @store_constant_adjacent_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-define void @store_constant_adjacent_offsets() {
+define amdgpu_kernel void @store_constant_adjacent_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
   ret void
@@ -335,7 +335,7 @@ define void @store_constant_adjacent_off
 ; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
 ; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
-define void @store_constant_disjoint_offsets() {
+define amdgpu_kernel void @store_constant_disjoint_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
   ret void
@@ -348,7 +348,7 @@ define void @store_constant_disjoint_off
 ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 ; SI: s_endpgm
-define void @store_misaligned64_constant_offsets() {
+define amdgpu_kernel void @store_misaligned64_constant_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   ret void
@@ -362,7 +362,7 @@ define void @store_misaligned64_constant
 ; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI: s_endpgm
-define void @store_misaligned64_constant_large_offsets() {
+define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
   ret void
@@ -371,7 +371,7 @@ define void @store_misaligned64_constant
 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
 
-define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
   %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
   %val = load float, float addrspace(1)* %in
@@ -410,7 +410,7 @@ define void @write2_sgemm_sequence(float
 ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}}
 ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}}
 ; CI: s_endpgm
-define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
   %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds_write2st64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds_write2st64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_write2st64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_write2st64.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
 ; SI: s_endpgm
-define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %val = load float, float addrspace(1)* %in.gep, align 4
@@ -25,7 +25,7 @@ define void @simple_write2st64_one_val_f
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
 ; SI: s_endpgm
-define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -46,7 +46,7 @@ define void @simple_write2st64_two_val_f
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; SI: s_endpgm
-define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -66,7 +66,7 @@ define void @simple_write2st64_two_val_m
 ; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]],
 ; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
 ; SI: s_endpgm
-define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
@@ -85,7 +85,7 @@ define void @simple_write2st64_two_val_m
 ; SI-NOT: ds_write2st64_b64
 ; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
 ; SI: s_endpgm
-define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %val = load double, double addrspace(1)* %in.gep, align 8

Modified: llvm/trunk/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/dynamic_stackalloc.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/dynamic_stackalloc.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/dynamic_stackalloc.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 
 ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
 
-define void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) {
+define amdgpu_kernel void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) {
   %alloca = alloca i32, i32 %n
   store volatile i32 0, i32* %alloca
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@
 ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_LO:[0-9]+]], v[[ADD_LO]], v[[VAL_LO]], vcc
 ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_HI:[0-9]+]], v[[ADD_HI]], v[[VAL_HI]], vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 entry:
   %v = load double, double addrspace(1)* %in
   %cc = fcmp oeq double %v, 1.000000e+00
@@ -32,7 +32,7 @@ endif:
 ; GCN: v_add_f64
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
-define void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
 entry:
   %v = load double, double addrspace(2)* %in
   %cc = fcmp oeq double %v, 1.000000e+00
@@ -62,7 +62,7 @@ endif:
 
 ; GCN-DAG: buffer_store_dword v
 ; GCN-DAG: buffer_store_dwordx2
-define void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
@@ -93,7 +93,7 @@ endif:
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 
 ; GCN: buffer_store_dwordx4
-define void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00

Modified: llvm/trunk/test/CodeGen/AMDGPU/early-if-convert.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/early-if-convert.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/early-if-convert.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/early-if-convert.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -32,7 +32,7 @@ endif:
 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -58,7 +58,7 @@ endif:
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
 ; GCN: s_mov_b64 vcc, [[CMP]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
-define void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
 entry:
   %v = load i32, i32 addrspace(1)* %in
   %cc = fcmp oeq float %k, 1.000000e+00
@@ -87,7 +87,7 @@ endif:
 ; GCN: v_mul_f32
 ; GCN: v_mul_f32
 ; GCN: v_cndmask_b32_e32
-define void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -128,7 +128,7 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -162,7 +162,7 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -187,7 +187,7 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 {
 entry:
   %v = load i32, i32 addrspace(2)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
@@ -206,7 +206,7 @@ endif:
 
 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
 ; GCN: v_cndmask_b32
-define void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 {
 entry:
   %v = load float, float addrspace(2)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -227,7 +227,7 @@ endif:
 
 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
 ; GCN: v_cndmask_b32
-define void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
 entry:
   %cc = fcmp oeq float %v, 1.000000e+00
   br i1 %cc, label %if, label %endif
@@ -248,7 +248,7 @@ endif:
 ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
-define void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 {
 entry:
   %v = load i32, i32 addrspace(2)* %in
   %cc = icmp eq i32 %cond, 1
@@ -274,7 +274,7 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = icmp eq i32 %cond, 1
@@ -295,7 +295,7 @@ endif:
 ; GCN: s_addc_u32
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 {
 entry:
   %v = load i64, i64 addrspace(2)* %in
   %cc = icmp eq i32 %cond, 1
@@ -320,7 +320,7 @@ endif:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 {
 entry:
   %v = load <3 x i32>, <3 x i32> addrspace(2)* %in
   %cc = icmp eq i32 %cond, 1
@@ -345,7 +345,7 @@ endif:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 {
 entry:
   %v = load <4 x i32>, <4 x i32> addrspace(2)* %in
   %cc = icmp eq i32 %cond, 1
@@ -364,7 +364,7 @@ endif:
 ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}}
-define void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %else, label %if
@@ -385,7 +385,7 @@ done:
 ; GCN: {{^}}; BB#0:
 ; GCN-NEXT: s_load_dwordx2
 ; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0
-define void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   br i1 undef, label %else, label %if
 
@@ -410,7 +410,7 @@ done:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
@@ -435,7 +435,7 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00

Modified: llvm/trunk/test/CodeGen/AMDGPU/elf.r600.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/elf.r600.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/elf.r600.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/elf.r600.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; CONFIG-NEXT: .long   2
 ; CONFIG-NEXT: .long   165900
 ; CONFIG-NEXT: .long   0
-define void @test(float addrspace(1)* %out, i32 %p) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, i32 %p) {
    %i = add i32 %p, 2
    %r = bitcast i32 %i to float
    store float %r, float addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/empty-function.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/empty-function.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/empty-function.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/empty-function.ll Tue Mar 21 16:39:51 2017
@@ -7,14 +7,14 @@
 ; SI-LABEL: {{^}}empty_function_ret:
 ; SI: s_endpgm
 ; SI: codeLenInByte = 4
-define void @empty_function_ret() #0 {
+define amdgpu_kernel void @empty_function_ret() #0 {
   ret void
 }
 
 ; SI: .text
 ; SI-LABEL: {{^}}empty_function_unreachable:
 ; SI: codeLenInByte = 0
-define void @empty_function_unreachable() #0 {
+define amdgpu_kernel void @empty_function_unreachable() #0 {
   unreachable
 }
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
 
 ; GCN-UNSAFE-NOT: xor
-define void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/endcf-loop-header.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/endcf-loop-header.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/endcf-loop-header.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/endcf-loop-header.ll Tue Mar 21 16:39:51 2017
@@ -12,7 +12,7 @@
 ; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}}
 ; CHECK-NOT: s_or_b64 exec, exec
 ; CHECK: s_cbranch_execnz [[LOOP_LABEL]]
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
 entry:
   %cond = call i32 @llvm.r600.read.tidig.x() #0
   %tmp0 = icmp eq i32 %cond, 0

Modified: llvm/trunk/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/exceed-max-sgprs.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/exceed-max-sgprs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/exceed-max-sgprs.ll Tue Mar 21 16:39:51 2017
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_tahiti
-define void @use_too_many_sgprs_tahiti() #0 {
+define amdgpu_kernel void @use_too_many_sgprs_tahiti() #0 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
@@ -20,7 +20,7 @@ define void @use_too_many_sgprs_tahiti()
 }
 
 ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire
-define void @use_too_many_sgprs_bonaire() #1 {
+define amdgpu_kernel void @use_too_many_sgprs_bonaire() #1 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
@@ -39,7 +39,7 @@ define void @use_too_many_sgprs_bonaire(
 }
 
 ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr
-define void @use_too_many_sgprs_bonaire_flat_scr() #1 {
+define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
@@ -59,7 +59,7 @@ define void @use_too_many_sgprs_bonaire_
 }
 
 ; ERROR: error: scalar registers limit of 96 exceeded (98) in use_too_many_sgprs_iceland
-define void @use_too_many_sgprs_iceland() #2 {
+define amdgpu_kernel void @use_too_many_sgprs_iceland() #2 {
   call void asm sideeffect "", "~{VCC}" ()
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
@@ -77,7 +77,7 @@ define void @use_too_many_sgprs_iceland(
 }
 
 ; ERROR: error: addressable scalar registers limit of 102 exceeded (103) in use_too_many_sgprs_fiji
-define void @use_too_many_sgprs_fiji() #3 {
+define amdgpu_kernel void @use_too_many_sgprs_fiji() #3 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()

Modified: llvm/trunk/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 ; GCN-LABEL: and_zext:
 ; GCN: v_and_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]]
-define void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id
   %a = load i16, i16 addrspace(1)* %in
@@ -18,7 +18,7 @@ define void @and_zext(i32 addrspace(1)*
 ; GCN-LABEL: or_zext:
 ; GCN: v_or_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]]
-define void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id
   %a = load i16, i16 addrspace(1)* %in
@@ -33,7 +33,7 @@ define void @or_zext(i32 addrspace(1)* %
 ; GCN-LABEL: xor_zext:
 ; GCN: v_xor_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]]
-define void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id
   %a = load i16, i16 addrspace(1)* %in

Modified: llvm/trunk/test/CodeGen/AMDGPU/extload-align.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extload-align.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extload-align.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extload-align.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; DEBUG: mem:LD2[<unknown>]{{[^(]}}
 ; DEBUG: {{^}}# End machine code for function extload_align.
 
-define void @extload_align(i32* %out, i32 %index) #0 {
+define amdgpu_kernel void @extload_align(i32* %out, i32 %index) #0 {
   %v0 = alloca [4 x i16]
   %a1 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 0
   %a2 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/extload-private.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extload-private.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extload-private.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extload-private.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}load_i8_sext_private:
 ; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
-define void @load_i8_sext_private(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i8
   %tmp1 = load i8, i8* %tmp0
@@ -14,7 +14,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}load_i8_zext_private:
 ; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
-define void @load_i8_zext_private(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i8
   %tmp1 = load i8, i8* %tmp0
@@ -25,7 +25,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}load_i16_sext_private:
 ; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
-define void @load_i16_sext_private(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i16
   %tmp1 = load i16, i16* %tmp0
@@ -36,7 +36,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}load_i16_zext_private:
 ; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
-define void @load_i16_zext_private(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i16
   %tmp1 = load volatile i16, i16* %tmp0

Modified: llvm/trunk/test/CodeGen/AMDGPU/extload.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extload.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extload.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extload.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
 ; EG: VTX_READ_32 [[VAL]]
-define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
   %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
   %load = load i32, i32 addrspace(1)* %cast
   %x = bitcast i32 %load to <4 x i8>
@@ -25,7 +25,7 @@ define void @global_anyext_load_i8(i8 ad
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
 ; EG: VTX_READ_32 [[VAL]]
-define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
   %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
   %load = load i32, i32 addrspace(1)* %cast
   %x = bitcast i32 %load to <2 x i16>
@@ -40,7 +40,7 @@ define void @global_anyext_load_i16(i16
 
 ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
 ; EG: LDS_WRITE * [[VAL]]
-define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
   %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
   %load = load i32, i32 addrspace(3)* %cast
   %x = bitcast i32 %load to <4 x i8>
@@ -55,7 +55,7 @@ define void @local_anyext_load_i8(i8 add
 
 ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
 ; EG: LDS_WRITE * [[VAL]]
-define void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
   %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
   %load = load i32, i32 addrspace(3)* %cast
   %x = bitcast i32 %load to <2 x i16>

Modified: llvm/trunk/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll Tue Mar 21 16:39:51 2017
@@ -13,7 +13,7 @@
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
-define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
                                                     <4 x i32> addrspace(1)* noalias %out1,
                                                     i32 addrspace(1)* noalias %out2,
                                                     i32 addrspace(1)* %in) {
@@ -55,7 +55,7 @@ define void @store_build_vector_multiple
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
-define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+define amdgpu_kernel void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
                                                             <4 x i32> addrspace(1)* noalias %out1,
                                                             i32 addrspace(1)* noalias %out2,
                                                             i32 addrspace(1)* %in) {
@@ -99,7 +99,7 @@ define void @store_build_vector_multiple
 
 ; GCN: buffer_store_dwordx2
 ; GCN: buffer_store_dwordx2
-define void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
+define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
                                                                      <4 x i32> addrspace(1)* noalias %out1,
                                                                      i64 addrspace(1)* noalias %out2,
                                                                      i32 addrspace(1)* %in) {

Modified: llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN-DAG: buffer_store_short [[VELT0]]
 ; GCN-DAG: buffer_store_short [[VELT1]]
-define void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
   %p0 = extractelement <2 x half> %vec, i32 0
   %p1 = extractelement <2 x half> %vec, i32 1
@@ -26,7 +26,7 @@ define void @extract_vector_elt_v2f16(ha
 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN: buffer_store_short [[VELT1]]
 ; GCN: ScratchSize: 0
-define void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
   %elt = extractelement <2 x half> %vec, i32 %idx
   store half %elt, half addrspace(1)* %out, align 2
@@ -45,7 +45,7 @@ define void @extract_vector_elt_v2f16_dy
 ; SI: buffer_store_short [[ELT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
 ; GCN: ScratchSize: 0{{$}}
-define void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
@@ -61,7 +61,7 @@ define void @extract_vector_elt_v2f16_dy
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
-define void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 {
   %p0 = extractelement <3 x half> %foo, i32 0
   %p1 = extractelement <3 x half> %foo, i32 2
   %out1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -75,7 +75,7 @@ define void @extract_vector_elt_v3f16(ha
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
-define void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 {
   %p0 = extractelement <4 x half> %foo, i32 0
   %p1 = extractelement <4 x half> %foo, i32 2
   %out1 = getelementptr half, half addrspace(1)* %out, i32 10
@@ -95,7 +95,7 @@ define void @extract_vector_elt_v4f16(ha
 
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
-define void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x half> %foo, i32 %idx
   %out1 = getelementptr half, half addrspace(1)* %out, i32 1
   store half %p0, half addrspace(1)* %out
@@ -115,7 +115,7 @@ define void @dynamic_extract_vector_elt_
 
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
-define void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 {
   %p0 = extractelement <4 x half> %foo, i32 %idx
   %out1 = getelementptr half, half addrspace(1)* %out, i32 1
   store half %p0, half addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_load_dwordx2
 ; GCN: buffer_store_dwordx2
-define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
   %ld = load volatile <3 x double>, <3 x double> addrspace(1)* %in
   %elt = extractelement <3 x double> %ld, i32 2
   store volatile double %elt, double addrspace(1)* %out
@@ -13,14 +13,14 @@ define void @extract_vector_elt_v3f64_2(
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64:
-define void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x double> %foo, i32 %elt
   store volatile double %dynelt, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64:
-define void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x double> %foo, i32 %elt
   store volatile double %dynelt, double addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN-DAG: buffer_store_short [[VELT0]]
 ; GCN-DAG: buffer_store_short [[VELT1]]
-define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %p0 = extractelement <2 x i16> %vec, i32 0
   %p1 = extractelement <2 x i16> %vec, i32 1
@@ -27,7 +27,7 @@ define void @extract_vector_elt_v2i16(i1
 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN: buffer_store_short [[VELT1]]
 ; GCN: ScratchSize: 0
-define void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %elt = extractelement <2 x i16> %vec, i32 %idx
   store i16 %elt, i16 addrspace(1)* %out, align 2
@@ -45,7 +45,7 @@ define void @extract_vector_elt_v2i16_dy
 ; SI: buffer_store_short [[ELT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
 ; GCN: ScratchSize: 0{{$}}
-define void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
@@ -61,7 +61,7 @@ define void @extract_vector_elt_v2i16_dy
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
-define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
   %p0 = extractelement <3 x i16> %foo, i32 0
   %p1 = extractelement <3 x i16> %foo, i32 2
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
@@ -82,7 +82,7 @@ define void @extract_vector_elt_v3i16(i1
 ; GFX9-DAG: buffer_store_short [[VLOAD0]], off
 ; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]]
 ; GFX9-DAG: buffer_store_short [[VLOAD1]], off
-define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
   %p0 = extractelement <4 x i16> %foo, i32 0
   %p1 = extractelement <4 x i16> %foo, i32 2
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
@@ -105,7 +105,7 @@ define void @extract_vector_elt_v4i16(i1
 
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
-define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x i16> %foo, i32 %idx
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
   store i16 %p0, i16 addrspace(1)* %out
@@ -131,7 +131,7 @@ define void @dynamic_extract_vector_elt_
 ; GFX9: buffer_store_dword
 ; GFX9: buffer_load_ushort
 ; GFX9: buffer_store_short
-define void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
   %p0 = extractelement <4 x i16> %foo, i32 %idx
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
   store i16 %p0, i16 addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dwordx2
-define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 {
+define amdgpu_kernel void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 {
   %vec = bitcast i64 %val to <2 x i32>
   %elt0 = extractelement <2 x i32> %vec, i32 0
   %elt1 = extractelement <2 x i32> %vec, i32 1
@@ -20,7 +20,7 @@ define void @extract_vector_elt_select_e
 }
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v2i64:
-define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 {
   %p0 = extractelement <2 x i64> %foo, i32 0
   %p1 = extractelement <2 x i64> %foo, i32 1
   %out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1
@@ -30,14 +30,14 @@ define void @extract_vector_elt_v2i64(i6
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
-define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <2 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2:
-define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
   %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo
   %or = or <2 x i64> %load, %arst
   %dynelt = extractelement <2 x i64> %or, i32 %elt
@@ -46,14 +46,14 @@ define void @dyn_extract_vector_elt_v2i6
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
-define void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
-define void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 ; FUNC-LABEL: {{^}}extract_vector_elt_v1i8:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
   %p0 = extractelement <1 x i8> %foo, i32 0
   store i8 %p0, i8 addrspace(1)* %out
   ret void
@@ -15,7 +15,7 @@ define void @extract_vector_elt_v1i8(i8
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
   %p0 = extractelement <2 x i8> %foo, i32 0
   %p1 = extractelement <2 x i8> %foo, i32 1
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -29,7 +29,7 @@ define void @extract_vector_elt_v2i8(i8
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
   %p0 = extractelement <3 x i8> %foo, i32 0
   %p1 = extractelement <3 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -43,7 +43,7 @@ define void @extract_vector_elt_v3i8(i8
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
   %p0 = extractelement <4 x i8> %foo, i32 0
   %p1 = extractelement <4 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -57,7 +57,7 @@ define void @extract_vector_elt_v4i8(i8
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
   %p0 = extractelement <8 x i8> %foo, i32 0
   %p1 = extractelement <8 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -71,7 +71,7 @@ define void @extract_vector_elt_v8i8(i8
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
   %p0 = extractelement <16 x i8> %foo, i32 0
   %p1 = extractelement <16 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -85,7 +85,7 @@ define void @extract_vector_elt_v16i8(i8
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
   %p0 = extractelement <32 x i8> %foo, i32 0
   %p1 = extractelement <32 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -99,7 +99,7 @@ define void @extract_vector_elt_v32i8(i8
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
   %p0 = extractelement <64 x i8> %foo, i32 0
   %p1 = extractelement <64 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -120,7 +120,7 @@ define void @extract_vector_elt_v64i8(i8
 ; GCN: buffer_store_byte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
-define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x i8> %foo, i32 %idx
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
   store i8 %p0, i8 addrspace(1)* %out
@@ -141,7 +141,7 @@ define void @dynamic_extract_vector_elt_
 ; GCN: buffer_store_byte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
-define void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
   %p0 = extractelement <4 x i8> %foo, i32 %idx
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
   store i8 %p0, i8 addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extractelt-to-trunc.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extractelt-to-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extractelt-to-trunc.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 ; GCN-DAG: buffer_load_dword [[A:v[0-9]+]]
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]]
 ; GCN: buffer_store_dword [[ADD]]
-define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
    %a = load i64, i64 addrspace(1)* %in
    %add = add i64 %a, %b
    %val.bc = bitcast i64 %add to <2 x i32>
@@ -20,7 +20,7 @@ define void @bitcast_int_to_vector_extra
 ; GCN: buffer_load_dwordx2
 ; GCN: v_add_f64
 ; GCN: buffer_store_dword v
-define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
+define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
    %a = load double, double addrspace(1)* %in
    %add = fadd double %a, %b
    %val.bc = bitcast double %add to <2 x i32>
@@ -33,7 +33,7 @@ define void @bitcast_fp_to_vector_extrac
 ; GCN: buffer_load_dwordx2
 ; GCN: v_add_i32
 ; GCN: buffer_store_dword
-define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
    %a = load i64, i64 addrspace(1)* %in
    %add = add i64 %a, %b
    %val.bc = bitcast i64 %add to <2 x float>
@@ -45,7 +45,7 @@ define void @bitcast_int_to_fpvector_ext
 ; GCN-LABEL: {{^}}no_extract_volatile_load_extract0:
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_store_dword v
-define void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 entry:
   %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
   %elt0 = extractelement <4 x i32> %vec, i32 0
@@ -57,7 +57,7 @@ entry:
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_store_dword v
 
-define void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 entry:
   %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
   %elt2 = extractelement <4 x i32> %vec, i32 2
@@ -68,7 +68,7 @@ entry:
 ; GCN-LABEL: {{^}}no_extract_volatile_load_dynextract:
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_store_dword v
-define void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
 entry:
   %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
   %eltN = extractelement <4 x i32> %vec, i32 %idx

Modified: llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@
 ; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
 ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 
-define void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
   %bc= bitcast i16 %in to half
   %fabs = call half @llvm.fabs.f16(half %bc)
   store half %fabs, half addrspace(1)* %out
@@ -22,7 +22,7 @@ define void @s_fabs_free_f16(half addrsp
 ; CI: flat_load_ushort [[VAL:v[0-9]+]],
 ; CI: v_and_b32_e32 [[CVT0:v[0-9]+]], 0x7fff, [[VAL]]
 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @s_fabs_f16(half addrspace(1)* %out, half %in) {
+define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
   %fabs = call half @llvm.fabs.f16(half %in)
   store half %fabs, half addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@ define void @s_fabs_f16(half addrspace(1
 
 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
 ; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
-define void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
+define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   store <2 x half> %fabs, <2 x half> addrspace(1)* %out
   ret void
@@ -68,7 +68,7 @@ define void @s_fabs_v2f16(<2 x half> add
 ; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
 
 ; GCN: flat_store_dwordx2
-define void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
+define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
   %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
   store <4 x half> %fabs, <4 x half> addrspace(1)* %out
   ret void
@@ -87,7 +87,7 @@ define void @s_fabs_v4f16(<4 x half> add
 ; VI-NOT: and
 ; VI: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN1]]|, [[IN0]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
+define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
   %fabs = call half @llvm.fabs.f16(half %in0)
   %fmul = fmul half %fabs, %in1
   store half %fmul, half addrspace(1)* %out
@@ -97,7 +97,7 @@ define void @fabs_fold_f16(half addrspac
 ; GCN-LABEL: {{^}}v_fabs_v2f16:
 ; GCN: flat_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, [[VAL]]
-define void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
@@ -110,7 +110,7 @@ define void @v_fabs_v2f16(<2 x half> add
 ; GCN-LABEL: {{^}}fabs_free_v2f16:
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
-define void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
   %bc = bitcast i32 %in to <2 x half>
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
   store <2 x half> %fabs, <2 x half> addrspace(1)* %out
@@ -133,7 +133,7 @@ define void @fabs_free_v2f16(<2 x half>
 
 ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
-define void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
   %fmul = fmul <2 x half> %fabs, %val

Modified: llvm/trunk/test/CodeGen/AMDGPU/fabs.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fabs.f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fabs.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fabs.f64.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@ declare <4 x double> @llvm.fabs.v4f64(<4
 ; FUNC-LABEL: {{^}}v_fabs_f64:
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %tidext = sext i32 %tid to i64
   %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext
@@ -24,7 +24,7 @@ define void @v_fabs_f64(double addrspace
 ; SI: v_and_b32
 ; SI-NOT: v_and_b32
 ; SI: s_endpgm
-define void @fabs_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double %in) {
   %fabs = call double @llvm.fabs.f64(double %in)
   store double %fabs, double addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@ define void @fabs_f64(double addrspace(1
 ; SI: v_and_b32
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
   store <2 x double> %fabs, <2 x double> addrspace(1)* %out
   ret void
@@ -46,7 +46,7 @@ define void @fabs_v2f64(<2 x double> add
 ; SI: v_and_b32
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
   %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
   store <4 x double> %fabs, <4 x double> addrspace(1)* %out
   ret void
@@ -57,7 +57,7 @@ define void @fabs_v4f64(<4 x double> add
 ; SI-NOT: and
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|
 ; SI: s_endpgm
-define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
   %fabs = call double @llvm.fabs.f64(double %in0)
   %fmul = fmul double %fabs, %in1
   store double %fmul, double addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @fabs_fold_f64(double addrsp
 ; SI-NOT: and
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|
 ; SI: s_endpgm
-define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
   %fabs = call double @fabs(double %in0)
   %fmul = fmul double %fabs, %in1
   store double %fmul, double addrspace(1)* %out
@@ -79,7 +79,7 @@ define void @fabs_fn_fold_f64(double add
 ; FUNC-LABEL: {{^}}fabs_free_f64:
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc= bitcast i64 %in to double
   %fabs = call double @llvm.fabs.f64(double %bc)
   store double %fabs, double addrspace(1)* %out
@@ -89,7 +89,7 @@ define void @fabs_free_f64(double addrsp
 ; FUNC-LABEL: {{^}}fabs_fn_free_f64:
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc= bitcast i64 %in to double
   %fabs = call double @fabs(double %bc)
   store double %fabs, double addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/fabs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fabs.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fabs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fabs.ll Tue Mar 21 16:39:51 2017
@@ -13,7 +13,7 @@
 
 ; GCN: v_and_b32
 
-define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
   %fabs = call float @fabs(float %bc)
   store float %fabs, float addrspace(1)* %out
@@ -26,7 +26,7 @@ define void @fabs_fn_free(float addrspac
 
 ; GCN: v_and_b32
 
-define void @fabs_free(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fabs_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
   %fabs = call float @llvm.fabs.f32(float %bc)
   store float %fabs, float addrspace(1)* %out
@@ -37,7 +37,7 @@ define void @fabs_free(float addrspace(1
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
 ; GCN: v_and_b32
-define void @fabs_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
   store float %fabs, float addrspace(1)* %out
   ret void
@@ -49,7 +49,7 @@ define void @fabs_f32(float addrspace(1)
 
 ; GCN: v_and_b32
 ; GCN: v_and_b32
-define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   store <2 x float> %fabs, <2 x float> addrspace(1)* %out
   ret void
@@ -65,7 +65,7 @@ define void @fabs_v2f32(<2 x float> addr
 ; GCN: v_and_b32
 ; GCN: v_and_b32
 ; GCN: v_and_b32
-define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   store <4 x float> %fabs, <4 x float> addrspace(1)* %out
   ret void
@@ -76,7 +76,7 @@ define void @fabs_v4f32(<4 x float> addr
 ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; GCN-NOT: and
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]|
-define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
+define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @fabs(float %in0)
   %fmul = fmul float %fabs, %in1
   store float %fmul, float addrspace(1)* %out
@@ -88,7 +88,7 @@ define void @fabs_fn_fold(float addrspac
 ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; GCN-NOT: and
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]|
-define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
+define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @llvm.fabs.f32(float %in0)
   %fmul = fmul float %fabs, %in1
   store float %fmul, float addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll Tue Mar 21 16:39:51 2017
@@ -28,7 +28,7 @@
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -55,7 +55,7 @@ define void @fast_add_fmuladd_fmul() #0
 ; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
 ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
 ; GCN-FASTFMA: buffer_store_dword [[FMA1]]
-define void @fast_sub_fmuladd_fmul() #0 {
+define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -87,7 +87,7 @@ define void @fast_sub_fmuladd_fmul() #0
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -120,7 +120,7 @@ define void @fast_add_fmuladd_fmul_multi
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -145,7 +145,7 @@ define void @fast_add_fmuladd_fmul_multi
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -170,7 +170,7 @@ define void @fast_add_fmuladd_fmul_multi
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -205,7 +205,7 @@ define void @fast_add_fmuladd_fmul_multi
 
 ; GCN: buffer_store_dword [[MUL]]
 ; GCN: buffer_store_dword [[MAD]]
-define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
+define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -241,7 +241,7 @@ define void @fast_sub_fmuladd_fmul_multi
 ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_subrev_f32_e32
-define void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
+define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef

Modified: llvm/trunk/test/CodeGen/AMDGPU/fadd.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fadd.f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fadd.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fadd.f16.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@
 ; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fadd_f16(
+define amdgpu_kernel void @fadd_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -31,7 +31,7 @@ entry:
 ; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fadd_f16_imm_a(
+define amdgpu_kernel void @fadd_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -49,7 +49,7 @@ entry:
 ; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fadd_f16_imm_b(
+define amdgpu_kernel void @fadd_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -83,7 +83,7 @@ entry:
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fadd_v2f16(
+define amdgpu_kernel void @fadd_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -111,7 +111,7 @@ entry:
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fadd_v2f16_imm_a(
+define amdgpu_kernel void @fadd_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -136,7 +136,7 @@ entry:
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fadd_v2f16_imm_b(
+define amdgpu_kernel void @fadd_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:

Modified: llvm/trunk/test/CodeGen/AMDGPU/fadd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fadd.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fadd.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fadd.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}fadd_f32:
 ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
 ; SI: v_add_f32
-define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float %a, float %b) #0 {
    %add = fadd float %a, %b
    store float %add, float addrspace(1)* %out, align 4
    ret void
@@ -16,7 +16,7 @@ define void @fadd_f32(float addrspace(1)
 ; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
 ; SI: v_add_f32
 ; SI: v_add_f32
-define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
   %add = fadd <2 x float> %a, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -31,7 +31,7 @@ define void @fadd_v2f32(<2 x float> addr
 ; SI: v_add_f32
 ; SI: v_add_f32
 ; SI: v_add_f32
-define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
   %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16
@@ -57,7 +57,7 @@ define void @fadd_v4f32(<4 x float> addr
 ; SI: v_add_f32
 ; SI: v_add_f32
 ; SI: v_add_f32
-define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 {
   %add = fadd <8 x float> %a, %b
   store <8 x float> %add, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -65,7 +65,7 @@ define void @fadd_v8f32(<8 x float> addr
 
 ; FUNC-LABEL: {{^}}fadd_0_nsz_attr_f32:
 ; SI-NOT: v_add_f32
-define void @fadd_0_nsz_attr_f32(float addrspace(1)* %out, float %a) #1 {
+define amdgpu_kernel void @fadd_0_nsz_attr_f32(float addrspace(1)* %out, float %a) #1 {
    %add = fadd float %a, 0.0
    store float %add, float addrspace(1)* %out, align 4
    ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/fadd64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fadd64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fadd64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fadd64.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 
 ; CHECK-LABEL: {{^}}v_fadd_f64:
 ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
-define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                         double addrspace(1)* %in2) {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -14,7 +14,7 @@ define void @v_fadd_f64(double addrspace
 
 ; CHECK-LABEL: {{^}}s_fadd_f64:
 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
+define amdgpu_kernel void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
   %r2 = fadd double %r0, %r1
   store double %r2, double addrspace(1)* %out
   ret void
@@ -24,7 +24,7 @@ define void @s_fadd_f64(double addrspace
 ; CHECK: v_add_f64
 ; CHECK: v_add_f64
 ; CHECK: _store_dwordx4
-define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                           <2 x double> addrspace(1)* %in2) {
   %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
   %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
@@ -37,7 +37,7 @@ define void @v_fadd_v2f64(<2 x double> a
 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 ; CHECK: _store_dwordx4
-define void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) {
+define amdgpu_kernel void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) {
   %r2 = fadd <2 x double> %r0, %r1
   store <2 x double> %r2, <2 x double> addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@ declare <2 x half> @llvm.canonicalize.v2
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
 ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
 ; GCN: buffer_store_short [[REG]]
-define void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
   %val = load half, half addrspace(1)* %out
   %canonicalized = call half @llvm.canonicalize.f16(half %val)
   store half %canonicalized, half addrspace(1)* %out
@@ -19,7 +19,7 @@ define void @v_test_canonicalize_var_f16
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f16:
 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
 ; GCN: buffer_store_short [[REG]]
-define void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
   %val = bitcast i16 %val.arg to half
   %canonicalized = call half @llvm.canonicalize.f16(half %val)
   store half %canonicalized, half addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @s_test_canonicalize_var_f16
 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
 ; GCN: buffer_store_short [[REG]]
-define void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
   %val = load half, half addrspace(1)* %out
   %val.fabs = call half @llvm.fabs.f16(half %val)
   %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs)
@@ -40,7 +40,7 @@ define void @v_test_canonicalize_fabs_va
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16:
 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}|
 ; GCN: buffer_store_short [[REG]]
-define void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 {
   %val = load half, half addrspace(1)* %out
   %val.fabs = call half @llvm.fabs.f16(half %val)
   %val.fabs.fneg = fsub half -0.0, %val.fabs
@@ -52,7 +52,7 @@ define void @v_test_canonicalize_fneg_fa
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16:
 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}}
 ; GCN: buffer_store_short [[REG]]
-define void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 {
   %val = load half, half addrspace(1)* %out
   %val.fneg = fsub half -0.0, %val
   %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
@@ -63,7 +63,7 @@ define void @v_test_canonicalize_fneg_va
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -72,7 +72,7 @@ define void @test_fold_canonicalize_p0_f
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half -0.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -81,7 +81,7 @@ define void @test_fold_canonicalize_n0_f
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 1.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -90,7 +90,7 @@ define void @test_fold_canonicalize_p1_f
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half -1.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -99,7 +99,7 @@ define void @test_fold_canonicalize_n1_f
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 16.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -108,7 +108,7 @@ define void @test_fold_canonicalize_lite
 ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -117,7 +117,7 @@ define void @test_default_denormals_fold
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -126,7 +126,7 @@ define void @test_denormals_fold_canonic
 ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -135,7 +135,7 @@ define void @test_default_denormals_fold
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -144,7 +144,7 @@ define void @test_denormals_fold_canonic
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -153,7 +153,7 @@ define void @test_fold_canonicalize_qnan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -162,7 +162,7 @@ define void @test_fold_canonicalize_qnan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -171,7 +171,7 @@ define void @test_fold_canonicalize_qnan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -180,7 +180,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -189,7 +189,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -198,7 +198,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -211,7 +211,7 @@ define void @test_fold_canonicalize_snan
 
 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
 ; GFX9: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %out
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -229,7 +229,7 @@ define void @v_test_canonicalize_var_v2f
 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}}
 ; GCN: buffer_store_dword
-define void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %out
   %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
@@ -246,7 +246,7 @@ define void @v_test_canonicalize_fabs_va
 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}}
 ; GCN: buffer_store_dword
-define void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %out
   %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
   %val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs
@@ -265,7 +265,7 @@ define void @v_test_canonicalize_fneg_fa
 
 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
 ; GFX9: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %out
   %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
@@ -280,7 +280,7 @@ define void @v_test_canonicalize_fneg_va
 
 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}}
 ; GFX9: buffer_store_dword [[REG]]
-define void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
   %val = bitcast i32 %val.arg to <2 x half>
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -290,7 +290,7 @@ define void @s_test_canonicalize_var_v2f
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -299,7 +299,7 @@ define void @test_fold_canonicalize_p0_v
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -308,7 +308,7 @@ define void @test_fold_canonicalize_n0_v
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -317,7 +317,7 @@ define void @test_fold_canonicalize_p1_v
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -326,7 +326,7 @@ define void @test_fold_canonicalize_n1_v
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -335,7 +335,7 @@ define void @test_fold_canonicalize_lite
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -344,7 +344,7 @@ define void @test_no_denormals_fold_cano
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -353,7 +353,7 @@ define void @test_denormals_fold_canonic
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -362,7 +362,7 @@ define void @test_no_denormals_fold_cano
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -371,7 +371,7 @@ define void @test_denormals_fold_canonic
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -380,7 +380,7 @@ define void @test_fold_canonicalize_qnan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -389,7 +389,7 @@ define void @test_fold_canonicalize_qnan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -398,7 +398,7 @@ define void @test_fold_canonicalize_qnan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -407,7 +407,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -416,7 +416,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -425,7 +425,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@ declare double @llvm.canonicalize.f64(do
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f32:
 ; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
 ; GCN: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
   store float %canonicalized, float addrspace(1)* %out
@@ -18,7 +18,7 @@ define void @v_test_canonicalize_var_f32
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f32:
 ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
 ; GCN: buffer_store_dword [[REG]]
-define void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -27,7 +27,7 @@ define void @s_test_canonicalize_var_f32
 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32:
 ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
 ; GCN: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %val.fabs = call float @llvm.fabs.f32(float %val)
   %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs)
@@ -38,7 +38,7 @@ define void @v_test_canonicalize_fabs_va
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32:
 ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}|
 ; GCN: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %val.fabs = call float @llvm.fabs.f32(float %val)
   %val.fabs.fneg = fsub float -0.0, %val.fabs
@@ -50,7 +50,7 @@ define void @v_test_canonicalize_fneg_fa
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32:
 ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}}
 ; GCN: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %val.fneg = fsub float -0.0, %val
   %canonicalized = call float @llvm.canonicalize.f32(float %val.fneg)
@@ -61,7 +61,7 @@ define void @v_test_canonicalize_fneg_va
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -70,7 +70,7 @@ define void @test_fold_canonicalize_p0_f
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32:
 ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -79,7 +79,7 @@ define void @test_fold_canonicalize_n0_f
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 1.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -88,7 +88,7 @@ define void @test_fold_canonicalize_p1_f
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float -1.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -97,7 +97,7 @@ define void @test_fold_canonicalize_n1_f
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 16.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -106,7 +106,7 @@ define void @test_fold_canonicalize_lite
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -115,7 +115,7 @@ define void @test_no_denormals_fold_cano
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -124,7 +124,7 @@ define void @test_denormals_fold_canonic
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -133,7 +133,7 @@ define void @test_no_denormals_fold_cano
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -142,7 +142,7 @@ define void @test_denormals_fold_canonic
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -151,7 +151,7 @@ define void @test_fold_canonicalize_qnan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -160,7 +160,7 @@ define void @test_fold_canonicalize_qnan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -169,7 +169,7 @@ define void @test_fold_canonicalize_qnan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -178,7 +178,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -187,7 +187,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -196,7 +196,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -205,7 +205,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f64:
 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{v\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %canonicalized = call double @llvm.canonicalize.f64(double %val)
   store double %canonicalized, double addrspace(1)* %out
@@ -215,7 +215,7 @@ define void @v_test_canonicalize_var_f64
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f64:
 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{s\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double %val)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -224,7 +224,7 @@ define void @s_test_canonicalize_var_f64
 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64:
 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, |{{v\[[0-9]+:[0-9]+\]}}|
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %val.fabs = call double @llvm.fabs.f64(double %val)
   %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs)
@@ -235,7 +235,7 @@ define void @v_test_canonicalize_fabs_va
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64:
 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]\]]], 1.0, -|{{v\[[0-9]+:[0-9]+\]}}|
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %val.fabs = call double @llvm.fabs.f64(double %val)
   %val.fabs.fneg = fsub double -0.0, %val.fabs
@@ -247,7 +247,7 @@ define void @v_test_canonicalize_fneg_fa
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64:
 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -{{v\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %val.fneg = fsub double -0.0, %val
   %canonicalized = call double @llvm.canonicalize.f64(double %val.fneg)
@@ -259,7 +259,7 @@ define void @v_test_canonicalize_fneg_va
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 0.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -269,7 +269,7 @@ define void @test_fold_canonicalize_p0_f
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double -0.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -279,7 +279,7 @@ define void @test_fold_canonicalize_n0_f
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 1.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -289,7 +289,7 @@ define void @test_fold_canonicalize_p1_f
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double -1.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -299,7 +299,7 @@ define void @test_fold_canonicalize_n1_f
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 16.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -309,7 +309,7 @@ define void @test_fold_canonicalize_lite
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -319,7 +319,7 @@ define void @test_no_denormals_fold_cano
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -329,7 +329,7 @@ define void @test_denormals_fold_canonic
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -339,7 +339,7 @@ define void @test_no_denormals_fold_cano
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -349,7 +349,7 @@ define void @test_denormals_fold_canonic
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -359,7 +359,7 @@ define void @test_fold_canonicalize_qnan
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -369,7 +369,7 @@ define void @test_fold_canonicalize_qnan
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -379,7 +379,7 @@ define void @test_fold_canonicalize_qnan
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -389,7 +389,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -399,7 +399,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -409,7 +409,7 @@ define void @test_fold_canonicalize_snan
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/fceil.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fceil.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fceil.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fceil.ll Tue Mar 21 16:39:51 2017
@@ -13,7 +13,7 @@ declare <16 x float> @llvm.ceil.v16f32(<
 ; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
-define void @fceil_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @fceil_f32(float addrspace(1)* %out, float %x) {
   %y = call float @llvm.ceil.f32(float %x) nounwind readnone
   store float %y, float addrspace(1)* %out
   ret void
@@ -25,7 +25,7 @@ define void @fceil_f32(float addrspace(1
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
-define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
   %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone
   store <2 x float> %y, <2 x float> addrspace(1)* %out
   ret void
@@ -41,7 +41,7 @@ define void @fceil_v2f32(<2 x float> add
 ; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
-define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+define amdgpu_kernel void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
   %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone
   store <3 x float> %y, <3 x float> addrspace(1)* %out
   ret void
@@ -57,7 +57,7 @@ define void @fceil_v3f32(<3 x float> add
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
-define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
   %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone
   store <4 x float> %y, <4 x float> addrspace(1)* %out
   ret void
@@ -82,7 +82,7 @@ define void @fceil_v4f32(<4 x float> add
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
-define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+define amdgpu_kernel void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
   %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone
   store <8 x float> %y, <8 x float> addrspace(1)* %out
   ret void
@@ -125,7 +125,7 @@ define void @fceil_v8f32(<8 x float> add
 ; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
-define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+define amdgpu_kernel void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
   %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone
   store <16 x float> %y, <16 x float> addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/fceil64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fceil64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fceil64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fceil64.ll Tue Mar 21 16:39:51 2017
@@ -31,7 +31,7 @@ declare <16 x double> @llvm.ceil.v16f64(
 ; SI: v_cndmask_b32
 ; SI: v_add_f64
 ; SI: s_endpgm
-define void @fceil_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @fceil_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.ceil.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
   ret void
@@ -40,7 +40,7 @@ define void @fceil_f64(double addrspace(
 ; FUNC-LABEL: {{^}}fceil_v2f64:
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, <2 x double> addrspace(1)* %out
   ret void
@@ -50,7 +50,7 @@ define void @fceil_v2f64(<2 x double> ad
 ; FIXME-CI: v_ceil_f64_e32
 ; FIXME-CI: v_ceil_f64_e32
 ; FIXME-CI: v_ceil_f64_e32
-; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; define amdgpu_kernel void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
 ;   %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone
 ;   store <3 x double> %y, <3 x double> addrspace(1)* %out
 ;   ret void
@@ -61,7 +61,7 @@ define void @fceil_v2f64(<2 x double> ad
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, <4 x double> addrspace(1)* %out
   ret void
@@ -76,7 +76,7 @@ define void @fceil_v4f64(<4 x double> ad
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, <8 x double> addrspace(1)* %out
   ret void
@@ -99,7 +99,7 @@ define void @fceil_v8f64(<8 x double> ad
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, <16 x double> addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/fcmp-cnd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcmp-cnd.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcmp-cnd.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcmp-cnd.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 ;registers and literal.x depending on what the optimizer does.
 ;CHECK: CNDE  T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float, float addrspace(1)* %in
   %cmp = fcmp oeq float %0, 0.000000e+00

Modified: llvm/trunk/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 
 ; CHECK: SET{{[A-Z]+}}_DX10
 
-define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float, float addrspace(1)* %in
   %cmp = fcmp oeq float %0, 0.000000e+00

Modified: llvm/trunk/test/CodeGen/AMDGPU/fcmp.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcmp.f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcmp.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcmp.f16.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_lt(
+define amdgpu_kernel void @fcmp_f16_lt(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -37,7 +37,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_lt_abs(
+define amdgpu_kernel void @fcmp_f16_lt_abs(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -62,7 +62,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_eq(
+define amdgpu_kernel void @fcmp_f16_eq(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -85,7 +85,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_le(
+define amdgpu_kernel void @fcmp_f16_le(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -108,7 +108,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_gt(
+define amdgpu_kernel void @fcmp_f16_gt(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -131,7 +131,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_lg(
+define amdgpu_kernel void @fcmp_f16_lg(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -154,7 +154,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_ge(
+define amdgpu_kernel void @fcmp_f16_ge(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -177,7 +177,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_o(
+define amdgpu_kernel void @fcmp_f16_o(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -200,7 +200,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_u(
+define amdgpu_kernel void @fcmp_f16_u(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -223,7 +223,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_nge(
+define amdgpu_kernel void @fcmp_f16_nge(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -246,7 +246,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_nlg(
+define amdgpu_kernel void @fcmp_f16_nlg(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -269,7 +269,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_ngt(
+define amdgpu_kernel void @fcmp_f16_ngt(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -292,7 +292,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_nle(
+define amdgpu_kernel void @fcmp_f16_nle(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -315,7 +315,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_neq(
+define amdgpu_kernel void @fcmp_f16_neq(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -338,7 +338,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_nlt(
+define amdgpu_kernel void @fcmp_f16_nlt(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -368,7 +368,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_lt(
+define amdgpu_kernel void @fcmp_v2f16_lt(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -398,7 +398,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_eq(
+define amdgpu_kernel void @fcmp_v2f16_eq(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -428,7 +428,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_le(
+define amdgpu_kernel void @fcmp_v2f16_le(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -458,7 +458,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_gt(
+define amdgpu_kernel void @fcmp_v2f16_gt(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -488,7 +488,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_lg(
+define amdgpu_kernel void @fcmp_v2f16_lg(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -518,7 +518,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_ge(
+define amdgpu_kernel void @fcmp_v2f16_ge(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -548,7 +548,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_o(
+define amdgpu_kernel void @fcmp_v2f16_o(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -578,7 +578,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_u(
+define amdgpu_kernel void @fcmp_v2f16_u(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -608,7 +608,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_nge(
+define amdgpu_kernel void @fcmp_v2f16_nge(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -638,7 +638,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_nlg(
+define amdgpu_kernel void @fcmp_v2f16_nlg(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -668,7 +668,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_ngt(
+define amdgpu_kernel void @fcmp_v2f16_ngt(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -698,7 +698,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_nle(
+define amdgpu_kernel void @fcmp_v2f16_nle(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -728,7 +728,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_neq(
+define amdgpu_kernel void @fcmp_v2f16_neq(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -758,7 +758,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_nlt(
+define amdgpu_kernel void @fcmp_v2f16_nlt(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {

Modified: llvm/trunk/test/CodeGen/AMDGPU/fcmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcmp.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcmp.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcmp.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 ; CHECK: {{^}}fcmp_sext:
 ; CHECK: SETE_DX10  T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float, float addrspace(1)* %in
   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1
@@ -22,7 +22,7 @@ entry:
 ; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}}
 ; CHECK-NEXT: {{[0-9]+\(5.0}}
 
-define void @fcmp_br(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_br(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp oeq float %in, 5.0
   br i1 %0, label %IF, label %ENDIF

Modified: llvm/trunk/test/CodeGen/AMDGPU/fcmp64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcmp64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcmp64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcmp64.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 
 ; CHECK-LABEL: {{^}}flt_f64:
 ; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -15,7 +15,7 @@ define void @flt_f64(i32 addrspace(1)* %
 
 ; CHECK-LABEL: {{^}}fle_f64:
 ; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -27,7 +27,7 @@ define void @fle_f64(i32 addrspace(1)* %
 
 ; CHECK-LABEL: {{^}}fgt_f64:
 ; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -39,7 +39,7 @@ define void @fgt_f64(i32 addrspace(1)* %
 
 ; CHECK-LABEL: {{^}}fge_f64:
 ; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -51,7 +51,7 @@ define void @fge_f64(i32 addrspace(1)* %
 
 ; CHECK-LABEL: {{^}}fne_f64:
 ; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -63,7 +63,7 @@ define void @fne_f64(double addrspace(1)
 
 ; CHECK-LABEL: {{^}}feq_f64:
 ; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2

Modified: llvm/trunk/test/CodeGen/AMDGPU/fconst64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fconst64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fconst64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fconst64.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000
 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
 
-define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
    %r1 = load double, double addrspace(1)* %in
    %r2 = fadd double %r1, 5.000000e+00
    store double %r2, double addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f16.ll Tue Mar 21 16:39:51 2017
@@ -23,7 +23,7 @@ declare <4 x half> @llvm.copysign.v4f16(
 ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_f16(
+define amdgpu_kernel void @test_copysign_f16(
   half addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -43,7 +43,7 @@ entry:
 ; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]]
 ; GCN: buffer_store_dword v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f32_mag_f16_sign_f32(
+define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32(
   float addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   float addrspace(1)* %arg_sign) {
@@ -65,7 +65,7 @@ entry:
 ; GCN: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_EXT_HI]], v[[SIGN_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_EXT_LO]]:[[OUT_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @test_copysign_out_f64_mag_f16_sign_f64(
+define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64(
   double addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   double addrspace(1)* %arg_sign) {
@@ -88,7 +88,7 @@ entry:
 ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
 ; GCN: buffer_store_dword v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f32_mag_f32_sign_f16(
+define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16(
   float addrspace(1)* %arg_out,
   float addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -111,7 +111,7 @@ entry:
 ; VI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_SHIFT]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_LO]]:[[OUT_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @test_copysign_out_f64_mag_f64_sign_f16(
+define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16(
   double addrspace(1)* %arg_out,
   double addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -136,7 +136,7 @@ entry:
 ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f16_mag_f16_sign_f32(
+define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32(
   half addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   float addrspace(1)* %arg_sign) {
@@ -161,7 +161,7 @@ entry:
 ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f16_mag_f16_sign_f64(
+define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64(
   half addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   double addrspace(1)* %arg_sign) {
@@ -188,7 +188,7 @@ entry:
 ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_TRUNC]], v[[SIGN]]
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f16_mag_f32_sign_f16(
+define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16(
   half addrspace(1)* %arg_out,
   float addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -204,7 +204,7 @@ entry:
 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16:
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
-define void @test_copysign_out_f16_mag_f64_sign_f16(
+define amdgpu_kernel void @test_copysign_out_f16_mag_f64_sign_f16(
   half addrspace(1)* %arg_out,
   double addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -221,7 +221,7 @@ entry:
 ; GCN: v_bfi_b32
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
-define void @test_copysign_v2f16(
+define amdgpu_kernel void @test_copysign_v2f16(
   <2 x half> addrspace(1)* %arg_out,
   <2 x half> %arg_mag,
   <2 x half> %arg_sign) {
@@ -236,7 +236,7 @@ entry:
 ; GCN: v_bfi_b32
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
-define void @test_copysign_v3f16(
+define amdgpu_kernel void @test_copysign_v3f16(
   <3 x half> addrspace(1)* %arg_out,
   <3 x half> %arg_mag,
   <3 x half> %arg_sign) {
@@ -252,7 +252,7 @@ entry:
 ; GCN: v_bfi_b32
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
-define void @test_copysign_v4f16(
+define amdgpu_kernel void @test_copysign_v4f16(
   <4 x half> addrspace(1)* %arg_out,
   <4 x half> %arg_mag,
   <4 x half> %arg_sign) {

Modified: llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f32.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f32.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f32.ll Tue Mar 21 16:39:51 2017
@@ -20,7 +20,7 @@ declare <4 x float> @llvm.copysign.v4f32
 ; GCN: s_endpgm
 
 ; EG: BFI_INT
-define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
   %result = call float @llvm.copysign.f32(float %mag, float %sign)
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -31,7 +31,7 @@ define void @test_copysign_f32(float add
 
 ; EG: BFI_INT
 ; EG: BFI_INT
-define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind {
   %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
   store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -44,7 +44,7 @@ define void @test_copysign_v2f32(<2 x fl
 ; EG: BFI_INT
 ; EG: BFI_INT
 ; EG: BFI_INT
-define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind {
   %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
   store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcopysign.f64.ll Tue Mar 21 16:39:51 2017
@@ -17,7 +17,7 @@ declare <4 x double> @llvm.copysign.v4f6
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
   store double %result, double addrspace(1)* %out, align 8
   ret void
@@ -32,7 +32,7 @@ define void @test_copysign_f64(double ad
 ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]]
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
-define void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind {
   %c = fpext float %sign to double
   %result = call double @llvm.copysign.f64(double %mag, double %c)
   store double %result, double addrspace(1)* %out, align 8
@@ -41,7 +41,7 @@ define void @test_copysign_f64_f32(doubl
 
 ; FUNC-LABEL: {{^}}test_copysign_v2f64:
 ; GCN: s_endpgm
-define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
   %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
   store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8
   ret void
@@ -49,7 +49,7 @@ define void @test_copysign_v2f64(<2 x do
 
 ; FUNC-LABEL: {{^}}test_copysign_v4f64:
 ; GCN: s_endpgm
-define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
   %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
   store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll Tue Mar 21 16:39:51 2017
@@ -31,7 +31,7 @@
 ; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
 ; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_fdiv_f16(
+define amdgpu_kernel void @v_fdiv_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) #0 {
@@ -54,7 +54,7 @@ entry:
 ; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -72,7 +72,7 @@ entry:
 ; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
 ; VI-NOT: [RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -91,7 +91,7 @@ entry:
 ; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -109,7 +109,7 @@ entry:
 ; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
 ; VI-NOT: [RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -127,7 +127,7 @@ entry:
 ; VI: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; VI-NOT: [RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -147,7 +147,7 @@ entry:
 ; VI-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
 ; VI-NOT: [RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -168,7 +168,7 @@ entry:
 ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -190,7 +190,7 @@ entry:
 ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
+define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -209,7 +209,7 @@ entry:
 
 ; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
 ; VI: buffer_store_short [[MUL]]
-define void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
   %x = load half, half addrspace(1)* undef
   %rcp = fdiv arcp half %x, 2.0
   store half %rcp, half addrspace(1)* %out, align 4
@@ -221,7 +221,7 @@ define void @div_arcp_2_x_pat_f16(half a
 
 ; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
 ; VI: buffer_store_short [[MUL]]
-define void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
   %x = load half, half addrspace(1)* undef
   %rcp = fdiv arcp half %x, 10.0
   store half %rcp, half addrspace(1)* %out, align 4
@@ -233,7 +233,7 @@ define void @div_arcp_k_x_pat_f16(half a
 
 ; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
 ; VI: buffer_store_short [[MUL]]
-define void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
   %x = load half, half addrspace(1)* undef
   %rcp = fdiv arcp half %x, -10.0
   store half %rcp, half addrspace(1)* %out, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/fdiv.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fdiv.f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fdiv.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fdiv.f64.ll Tue Mar 21 16:39:51 2017
@@ -29,7 +29,7 @@
 ; GCN: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
 ; GCN: s_endpgm
-define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1
   %num = load volatile double, double addrspace(1)* %in
   %den = load volatile double, double addrspace(1)* %gep.1
@@ -39,7 +39,7 @@ define void @fdiv_f64(double addrspace(1
 }
 
 ; GCN-LABEL: {{^}}fdiv_f64_s_v:
-define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 {
+define amdgpu_kernel void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 {
   %den = load double, double addrspace(1)* %in
   %result = fdiv double %num, %den
   store double %result, double addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @fdiv_f64_s_v(double addrspa
 }
 
 ; GCN-LABEL: {{^}}fdiv_f64_v_s:
-define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) #0 {
+define amdgpu_kernel void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) #0 {
   %num = load double, double addrspace(1)* %in
   %result = fdiv double %num, %den
   store double %result, double addrspace(1)* %out
@@ -55,14 +55,14 @@ define void @fdiv_f64_v_s(double addrspa
 }
 
 ; GCN-LABEL: {{^}}fdiv_f64_s_s:
-define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) #0 {
+define amdgpu_kernel void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) #0 {
   %result = fdiv double %num, %den
   store double %result, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_fdiv_v2f64:
-define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
   %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1
   %num = load <2 x double>, <2 x double> addrspace(1)* %in
   %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1
@@ -72,14 +72,14 @@ define void @v_fdiv_v2f64(<2 x double> a
 }
 
 ; GCN-LABEL: {{^}}s_fdiv_v2f64:
-define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) {
+define amdgpu_kernel void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) {
   %result = fdiv <2 x double> %num, %den
   store <2 x double> %result, <2 x double> addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_fdiv_v4f64:
-define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
   %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1
   %num = load <4 x double>, <4 x double> addrspace(1)* %in
   %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1
@@ -89,7 +89,7 @@ define void @v_fdiv_v4f64(<4 x double> a
 }
 
 ; GCN-LABEL: {{^}}s_fdiv_v4f64:
-define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) #0 {
+define amdgpu_kernel void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) #0 {
   %result = fdiv <4 x double> %num, %den
   store <4 x double> %result, <4 x double> addrspace(1)* %out
   ret void
@@ -98,7 +98,7 @@ define void @s_fdiv_v4f64(<4 x double> a
 ; GCN-LABEL: {{^}}div_fast_2_x_pat_f64:
 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 0.5
 ; GCN: buffer_store_dwordx2 [[MUL]]
-define void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 {
   %x = load double, double addrspace(1)* undef
   %rcp = fdiv fast double %x, 2.0
   store double %rcp, double addrspace(1)* %out, align 4
@@ -110,7 +110,7 @@ define void @div_fast_2_x_pat_f64(double
 ; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fb99999
 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 ; GCN: buffer_store_dwordx2 [[MUL]]
-define void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 {
   %x = load double, double addrspace(1)* undef
   %rcp = fdiv fast double %x, 10.0
   store double %rcp, double addrspace(1)* %out, align 4
@@ -122,7 +122,7 @@ define void @div_fast_k_x_pat_f64(double
 ; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfb99999
 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 ; GCN: buffer_store_dwordx2 [[MUL]]
-define void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 {
   %x = load double, double addrspace(1)* undef
   %rcp = fdiv fast double %x, -10.0
   store double %rcp, double addrspace(1)* %out, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll Tue Mar 21 16:39:51 2017
@@ -27,7 +27,7 @@
 ; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
 ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
-define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -52,7 +52,7 @@ entry:
 ; GCN-NOT: s_setreg
 ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
 ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
-define void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -65,7 +65,7 @@ entry:
 ; GCN: v_rcp_f32
 ; GCN: v_mul_f32
 ; GCN: v_mul_f32
-define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv float %a, %b, !fpmath !0
   store float %fdiv, float addrspace(1)* %out
@@ -77,7 +77,7 @@ entry:
 ; GCN: v_fma_f32
 ; GCN: v_div_fmas_f32
 ; GCN: v_div_fixup_f32
-define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv float %a, %b, !fpmath !0
   store float %fdiv, float addrspace(1)* %out
@@ -89,7 +89,7 @@ entry:
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv fast float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -104,7 +104,7 @@ entry:
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv fast float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -119,7 +119,7 @@ entry:
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv arcp float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -136,7 +136,7 @@ entry:
 ; GCN: v_div_scale_f32
 ; GCN: v_div_scale_f32
 ; GCN: v_div_scale_f32
-define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv <2 x float> %a, %b
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -146,7 +146,7 @@ entry:
 ; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
 ; GCN: v_cmp_gt_f32
 ; GCN: v_cmp_gt_f32
-define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -161,7 +161,7 @@ entry:
 
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
-define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv fast <2 x float> %a, %b
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -176,7 +176,7 @@ entry:
 
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
-define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv arcp <2 x float> %a, %b
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -197,7 +197,7 @@ entry:
 ; GCN: v_div_fixup_f32
 ; GCN: v_div_fixup_f32
 ; GCN: v_div_fixup_f32
-define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -220,7 +220,7 @@ define void @fdiv_v4f32(<4 x float> addr
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
-define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -243,7 +243,7 @@ define void @fdiv_v4f32_fast_math(<4 x f
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
-define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr

Modified: llvm/trunk/test/CodeGen/AMDGPU/ffloor.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ffloor.f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ffloor.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ffloor.f64.ll Tue Mar 21 16:39:51 2017
@@ -19,7 +19,7 @@ declare <16 x double> @llvm.floor.v16f64
 ; SI: v_cndmask_b32_e32
 ; SI: v_add_f64
 ; SI: s_endpgm
-define void @ffloor_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ffloor_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.floor.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@ define void @ffloor_f64(double addrspace
 ; SI: v_cndmask_b32_e32
 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]]
 ; SI: s_endpgm
-define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
   %neg = fsub double 0.0, %x
   %y = call double @llvm.floor.f64(double %neg) nounwind readnone
   store double %y, double addrspace(1)* %out
@@ -50,7 +50,7 @@ define void @ffloor_f64_neg(double addrs
 ; SI: v_cndmask_b32_e32
 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]|
 ; SI: s_endpgm
-define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
   %abs = call double @llvm.fabs.f64(double %x)
   %neg = fsub double 0.0, %abs
   %y = call double @llvm.floor.f64(double %neg) nounwind readnone
@@ -61,7 +61,7 @@ define void @ffloor_f64_neg_abs(double a
 ; FUNC-LABEL: {{^}}ffloor_v2f64:
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
-define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, <2 x double> addrspace(1)* %out
   ret void
@@ -72,7 +72,7 @@ define void @ffloor_v2f64(<2 x double> a
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
 ; CI-NOT: v_floor_f64_e32
-define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+define amdgpu_kernel void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
   %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
   store <3 x double> %y, <3 x double> addrspace(1)* %out
   ret void
@@ -83,7 +83,7 @@ define void @ffloor_v3f64(<3 x double> a
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
-define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, <4 x double> addrspace(1)* %out
   ret void
@@ -98,7 +98,7 @@ define void @ffloor_v4f64(<4 x double> a
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
-define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, <8 x double> addrspace(1)* %out
   ret void
@@ -121,7 +121,7 @@ define void @ffloor_v8f64(<8 x double> a
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
-define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, <16 x double> addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/ffloor.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ffloor.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ffloor.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ffloor.ll Tue Mar 21 16:39:51 2017
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}floor_f32:
 ; SI: v_floor_f32_e32
 ; R600: FLOOR
-define void @floor_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @floor_f32(float addrspace(1)* %out, float %in) {
   %tmp = call float @llvm.floor.f32(float %in) #0
   store float %tmp, float addrspace(1)* %out
   ret void
@@ -15,7 +15,7 @@ define void @floor_f32(float addrspace(1
 ; SI: v_floor_f32_e32
 ; SI: v_floor_f32_e32
 
-define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0
   store <2 x float> %tmp, <2 x float> addrspace(1)* %out
   ret void
@@ -31,7 +31,7 @@ define void @floor_v2f32(<2 x float> add
 ; R600: FLOOR
 ; R600: FLOOR
 ; R600: FLOOR
-define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0
   store <4 x float> %tmp, <4 x float> addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/flat-address-space.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/flat-address-space.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/flat-address-space.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/flat-address-space.ll Tue Mar 21 16:39:51 2017
@@ -17,7 +17,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
 ; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
 ; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]]
-define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
+define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
   %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
   store volatile i32 %x, i32 addrspace(4)* %fptr, align 4
   ret void
@@ -25,7 +25,7 @@ define void @store_flat_i32(i32 addrspac
 
 ; CHECK-LABEL: {{^}}store_flat_i64:
 ; CHECK: flat_store_dwordx2
-define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
+define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
   %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
   store volatile i64 %x, i64 addrspace(4)* %fptr, align 8
   ret void
@@ -33,7 +33,7 @@ define void @store_flat_i64(i64 addrspac
 
 ; CHECK-LABEL: {{^}}store_flat_v4i32:
 ; CHECK: flat_store_dwordx4
-define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
+define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
   %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
   store volatile <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
   ret void
@@ -41,7 +41,7 @@ define void @store_flat_v4i32(<4 x i32>
 
 ; CHECK-LABEL: {{^}}store_flat_trunc_i16:
 ; CHECK: flat_store_short
-define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
+define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
   %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
   %y = trunc i32 %x to i16
   store volatile i16 %y, i16 addrspace(4)* %fptr, align 2
@@ -50,7 +50,7 @@ define void @store_flat_trunc_i16(i16 ad
 
 ; CHECK-LABEL: {{^}}store_flat_trunc_i8:
 ; CHECK: flat_store_byte
-define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
+define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
   %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
   %y = trunc i32 %x to i8
   store volatile i8 %y, i8 addrspace(4)* %fptr, align 2
@@ -61,7 +61,7 @@ define void @store_flat_trunc_i8(i8 addr
 
 ; CHECK-LABEL: load_flat_i32:
 ; CHECK: flat_load_dword
-define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
   %fload = load volatile i32, i32 addrspace(4)* %fptr, align 4
   store i32 %fload, i32 addrspace(1)* %out, align 4
@@ -70,7 +70,7 @@ define void @load_flat_i32(i32 addrspace
 
 ; CHECK-LABEL: load_flat_i64:
 ; CHECK: flat_load_dwordx2
-define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
   %fload = load volatile i64, i64 addrspace(4)* %fptr, align 8
   store i64 %fload, i64 addrspace(1)* %out, align 8
@@ -79,7 +79,7 @@ define void @load_flat_i64(i64 addrspace
 
 ; CHECK-LABEL: load_flat_v4i32:
 ; CHECK: flat_load_dwordx4
-define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
   %fload = load volatile <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32
   store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
@@ -88,7 +88,7 @@ define void @load_flat_v4i32(<4 x i32> a
 
 ; CHECK-LABEL: sextload_flat_i8:
 ; CHECK: flat_load_sbyte
-define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
   %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4
   %ext = sext i8 %fload to i32
@@ -98,7 +98,7 @@ define void @sextload_flat_i8(i32 addrsp
 
 ; CHECK-LABEL: zextload_flat_i8:
 ; CHECK: flat_load_ubyte
-define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
   %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4
   %ext = zext i8 %fload to i32
@@ -108,7 +108,7 @@ define void @zextload_flat_i8(i32 addrsp
 
 ; CHECK-LABEL: sextload_flat_i16:
 ; CHECK: flat_load_sshort
-define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
   %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4
   %ext = sext i16 %fload to i32
@@ -118,7 +118,7 @@ define void @sextload_flat_i16(i32 addrs
 
 ; CHECK-LABEL: zextload_flat_i16:
 ; CHECK: flat_load_ushort
-define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
   %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4
   %ext = zext i16 %fload to i32
@@ -131,7 +131,7 @@ define void @zextload_flat_i16(i32 addrs
 ; CHECK: flat_load_ubyte
 ; CHECK: flat_load_ubyte
 ; CHECK: flat_load_ubyte
-define void @flat_scratch_unaligned_load() {
+define amdgpu_kernel void @flat_scratch_unaligned_load() {
   %scratch = alloca i32
   %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
   %ld = load volatile i32, i32 addrspace(4)* %fptr, align 1
@@ -143,7 +143,7 @@ define void @flat_scratch_unaligned_load
 ; CHECK: flat_store_byte
 ; CHECK: flat_store_byte
 ; CHECK: flat_store_byte
-define void @flat_scratch_unaligned_store() {
+define amdgpu_kernel void @flat_scratch_unaligned_store() {
   %scratch = alloca i32
   %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
   store volatile i32 0, i32 addrspace(4)* %fptr, align 1
@@ -154,7 +154,7 @@ define void @flat_scratch_unaligned_stor
 ; HSA: flat_load_dword
 ; HSA: flat_load_dword
 ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
-define void @flat_scratch_multidword_load() {
+define amdgpu_kernel void @flat_scratch_multidword_load() {
   %scratch = alloca <2 x i32>
   %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)*
   %ld = load volatile <2 x i32>, <2 x i32> addrspace(4)* %fptr
@@ -165,7 +165,7 @@ define void @flat_scratch_multidword_loa
 ; HSA: flat_store_dword
 ; HSA: flat_store_dword
 ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
-define void @flat_scratch_multidword_store() {
+define amdgpu_kernel void @flat_scratch_multidword_store() {
   %scratch = alloca <2 x i32>
   %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)*
   store volatile <2 x i32> zeroinitializer, <2 x i32> addrspace(4)* %fptr

Modified: llvm/trunk/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll Tue Mar 21 16:39:51 2017
@@ -23,7 +23,7 @@
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
 ; NOHSA-NOADDR64: flat_store_dword
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
 entry:
   store i32 0, i32 addrspace(1)* %out
   ret void
@@ -36,7 +36,7 @@ entry:
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
 ; NOHSA-NOADDR64: flat_store_dword
-define void @test_addr64(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_addr64(i32 addrspace(1)* %out) {
 entry:
   %out.addr = alloca i32 addrspace(1)*, align 4
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/flat-scratch-reg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/flat-scratch-reg.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/flat-scratch-reg.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/flat-scratch-reg.ll Tue Mar 21 16:39:51 2017
@@ -19,7 +19,7 @@
 ; CI: ; NumSgprs: 8
 ; VI-NOXNACK: ; NumSgprs: 8
 ; VI-XNACK: ; NumSgprs: 12
-define void @no_vcc_no_flat() {
+define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7}"()
   ret void
@@ -33,7 +33,7 @@ entry:
 ; CI: ; NumSgprs: 10
 ; VI-NOXNACK: ; NumSgprs: 10
 ; VI-XNACK: ; NumSgprs: 12
-define void @vcc_no_flat() {
+define amdgpu_kernel void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{VCC}"()
   ret void
@@ -50,7 +50,7 @@ entry:
 ; HSA-CI: ; NumSgprs: 8
 ; HSA-VI-NOXNACK: ; NumSgprs: 8
 ; HSA-VI-XNACK: ; NumSgprs: 12
-define void @no_vcc_flat() {
+define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"()
   ret void
@@ -66,7 +66,7 @@ entry:
 ; HSA-CI: ; NumSgprs: 10
 ; HSA-VI-NOXNACK: ; NumSgprs: 10
 ; HSA-VI-XNACK: ; NumSgprs: 12
-define void @vcc_flat() {
+define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"()
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/flat_atomics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/flat_atomics.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/flat_atomics.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/flat_atomics.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i32_offset:
 ; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -13,7 +13,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i32_ret_offset:
 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -23,7 +23,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset:
 ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -34,7 +34,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -45,7 +45,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i32:
 ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -54,7 +54,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i32_ret:
 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -63,7 +63,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i32_addr64:
 ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -73,7 +73,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64:
 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -83,7 +83,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i32_offset:
 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -93,7 +93,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i32_ret_offset:
 ; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -103,7 +103,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i32_addr64_offset:
 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -114,7 +114,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
 ; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -125,7 +125,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i32:
 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -134,7 +134,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i32_ret:
 ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -143,7 +143,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i32_addr64:
 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -153,7 +153,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64:
 ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -163,7 +163,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i32_offset:
 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -173,7 +173,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i32_ret_offset:
 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -183,7 +183,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i32_addr64_offset:
 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -194,7 +194,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -205,7 +205,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i32:
 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -214,7 +214,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i32_ret:
 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -223,7 +223,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i32_addr64:
 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -233,7 +233,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64:
 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -243,7 +243,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i32_offset:
 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -253,7 +253,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i32_ret_offset:
 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -263,7 +263,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i32_addr64_offset:
 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -274,7 +274,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -285,7 +285,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i32:
 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -294,7 +294,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i32_ret:
 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -303,7 +303,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i32_addr64:
 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -313,7 +313,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64:
 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -323,7 +323,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i32_offset:
 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -333,7 +333,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i32_ret_offset:
 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -343,7 +343,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset:
 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -354,7 +354,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -365,7 +365,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i32:
 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -374,7 +374,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i32_ret:
 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -383,7 +383,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i32_addr64:
 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -393,7 +393,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64:
 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -403,7 +403,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i32_offset:
 ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -413,7 +413,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i32_ret_offset:
 ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -423,7 +423,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset:
 ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -434,7 +434,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
 ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -445,7 +445,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i32:
 ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -454,7 +454,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i32_ret:
 ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -463,7 +463,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i32_addr64:
 ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -473,7 +473,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64:
 ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -483,7 +483,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i32_offset:
 ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -493,7 +493,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i32_ret_offset:
 ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -503,7 +503,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i32_addr64_offset:
 ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -514,7 +514,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
 ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -525,7 +525,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i32:
 ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -534,7 +534,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i32_ret:
 ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -543,7 +543,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i32_addr64:
 ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -553,7 +553,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64:
 ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]{{$}}
-  define void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+  define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -563,7 +563,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i32_offset:
 ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -573,7 +573,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i32_ret_offset:
 ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -583,7 +583,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset:
 ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -594,7 +594,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
 ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -605,7 +605,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i32:
 ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -614,7 +614,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i32_ret:
 ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -623,7 +623,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i32_addr64:
 ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -633,7 +633,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64:
 ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -643,7 +643,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32_offset:
 ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -653,7 +653,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
 ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -663,7 +663,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
 ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -674,7 +674,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
 ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -685,7 +685,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32:
 ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -694,7 +694,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret:
 ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -703,7 +703,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64:
 ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -713,7 +713,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
 ; GCN: flat_atomic_swap [[RET:v[0-9]+]],  v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -725,7 +725,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset:
 ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -735,7 +735,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
 ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -746,7 +746,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
 ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -757,7 +757,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -769,7 +769,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32:
 ; GCN: flat_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst
   ret void
@@ -778,7 +778,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret:
 ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
@@ -788,7 +788,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
 ; GCN: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -798,7 +798,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
 ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -809,7 +809,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i32_offset:
 ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -819,7 +819,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i32_ret_offset:
 ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -829,7 +829,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset:
 ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -840,7 +840,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
 ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -851,7 +851,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i32:
 ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -860,7 +860,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i32_ret:
 ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -869,7 +869,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i32_addr64:
 ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -879,7 +879,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64:
 ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -890,7 +890,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i32_offset:
 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %in, i32 4
   %val = load atomic i32, i32 addrspace(4)* %gep  seq_cst, align 4
@@ -901,7 +901,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i32:
 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(4)* %in seq_cst, align 4
   store i32 %val, i32 addrspace(4)* %out
@@ -911,7 +911,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i32_addr64_offset:
 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -923,7 +923,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i32_addr64:
 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index
   %val = load atomic i32, i32 addrspace(4)* %ptr seq_cst, align 4
@@ -933,7 +933,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i32_offset:
 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   store atomic i32 %in, i32 addrspace(4)* %gep  seq_cst, align 4
@@ -942,7 +942,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i32:
 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4
   ret void
@@ -950,7 +950,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset:
 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -960,7 +960,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i32_addr64:
 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   store atomic i32 %in, i32 addrspace(4)* %ptr seq_cst, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/flat_atomics_i64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/flat_atomics_i64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/flat_atomics_i64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/flat_atomics_i64.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i64_offset:
 ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -13,7 +13,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i64_ret_offset:
 ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -23,7 +23,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset:
 ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -34,7 +34,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64_offset:
 ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -45,7 +45,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i64:
 ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -54,7 +54,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i64_ret:
 ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -63,7 +63,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i64_addr64:
 ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -73,7 +73,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64:
 ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -83,7 +83,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i64_offset:
 ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -93,7 +93,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i64_ret_offset:
 ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -103,7 +103,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset:
 ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -114,7 +114,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64_offset:
 ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -125,7 +125,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i64:
 ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -134,7 +134,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i64_ret:
 ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -143,7 +143,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i64_addr64:
 ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -153,7 +153,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64:
 ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -163,7 +163,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i64_offset:
 ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -173,7 +173,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset:
 ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -183,7 +183,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset:
 ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -194,7 +194,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64_offset:
 ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -205,7 +205,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i64:
 ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -214,7 +214,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret:
 ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -223,7 +223,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i64_addr64:
 ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -233,7 +233,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64:
 ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -243,7 +243,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i64_offset:
 ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -253,7 +253,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i64_ret_offset:
 ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -263,7 +263,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset:
 ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -274,7 +274,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64_offset:
 ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -285,7 +285,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i64:
 ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -294,7 +294,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i64_ret:
 ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -303,7 +303,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i64_addr64:
 ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -313,7 +313,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64:
 ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -323,7 +323,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i64_offset:
 ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -333,7 +333,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset:
 ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -343,7 +343,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset:
 ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -354,7 +354,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64_offset:
 ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -365,7 +365,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i64:
 ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -374,7 +374,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret:
 ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -383,7 +383,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i64_addr64:
 ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -393,7 +393,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64:
 ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -403,7 +403,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i64_offset:
 ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -413,7 +413,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i64_ret_offset:
 ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -423,7 +423,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset:
 ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -434,7 +434,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64_offset:
 ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -445,7 +445,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i64:
 ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -454,7 +454,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i64_ret:
 ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -463,7 +463,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i64_addr64:
 ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -473,7 +473,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64:
 ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -483,7 +483,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i64_offset:
 ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -493,7 +493,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset:
 ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -503,7 +503,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset:
 ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -514,7 +514,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64_offset:
 ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -525,7 +525,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i64:
 ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -534,7 +534,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret:
 ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -543,7 +543,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i64_addr64:
 ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -553,7 +553,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64:
 ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -563,7 +563,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i64_offset:
 ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -573,7 +573,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i64_ret_offset:
 ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -583,7 +583,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset:
 ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -594,7 +594,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64_offset:
 ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -605,7 +605,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i64:
 ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -614,7 +614,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i64_ret:
 ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -623,7 +623,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i64_addr64:
 ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -633,7 +633,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64:
 ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -643,7 +643,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
 ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -653,7 +653,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -663,7 +663,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset:
 ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -674,7 +674,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64_offset:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -685,7 +685,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64:
 ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -694,7 +694,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -703,7 +703,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64:
 ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -713,7 +713,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]],  v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -723,7 +723,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i64_offset:
 ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -733,7 +733,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset:
 ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -743,7 +743,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset:
 ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -754,7 +754,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64_offset:
 ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -765,7 +765,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i64:
 ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -774,7 +774,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret:
 ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -783,7 +783,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i64_addr64:
 ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -793,7 +793,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64:
 ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -804,7 +804,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i64_offset:
 ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %in, i64 4
   %val = load atomic i64, i64 addrspace(4)* %gep  seq_cst, align 8
@@ -815,7 +815,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i64:
 ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
 entry:
   %val = load atomic i64, i64 addrspace(4)* %in seq_cst, align 8
   store i64 %val, i64 addrspace(4)* %out
@@ -825,7 +825,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i64_addr64_offset:
 ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -837,7 +837,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i64_addr64:
 ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index
   %val = load atomic i64, i64 addrspace(4)* %ptr seq_cst, align 8
@@ -847,7 +847,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i64_offset:
 ; GCN: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   store atomic i64 %in, i64 addrspace(4)* %gep  seq_cst, align 8
@@ -856,7 +856,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i64:
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc
-define void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) {
 entry:
   store atomic i64 %in, i64 addrspace(4)* %out seq_cst, align 8
   ret void
@@ -864,7 +864,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset:
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -874,7 +874,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i64_addr64:
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   store atomic i64 %in, i64 addrspace(4)* %ptr seq_cst, align 8
@@ -883,7 +883,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -892,7 +892,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_soffset:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 9000
   %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -902,7 +902,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset:
 ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -913,7 +913,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -924,7 +924,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset:
 ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -936,7 +936,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) {
 entry:
   %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst
   ret void
@@ -945,7 +945,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret:
 ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
 entry:
   %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
@@ -955,7 +955,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst
@@ -965,7 +965,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64:
 ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst

Modified: llvm/trunk/test/CodeGen/AMDGPU/fma-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fma-combine.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fma-combine.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fma-combine.ll Tue Mar 21 16:39:51 2017
@@ -18,7 +18,7 @@ declare float @llvm.fma.f32(float, float
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -46,7 +46,7 @@ define void @combine_to_fma_f64_0(double
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -75,7 +75,7 @@ define void @combine_to_fma_f64_0_2use(d
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -99,7 +99,7 @@ define void @combine_to_fma_f64_1(double
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -127,7 +127,7 @@ define void @combine_to_fma_fsub_0_f64(d
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -156,7 +156,7 @@ define void @combine_to_fma_fsub_f64_0_2
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -184,7 +184,7 @@ define void @combine_to_fma_fsub_1_f64(d
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -213,7 +213,7 @@ define void @combine_to_fma_fsub_1_f64_2
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -242,7 +242,7 @@ define void @combine_to_fma_fsub_2_f64(d
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -276,7 +276,7 @@ define void @combine_to_fma_fsub_2_f64_2
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -317,7 +317,7 @@ define void @combine_to_fma_fsub_2_f64_2
 ; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
 
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -358,7 +358,7 @@ define void @aggressive_combine_to_fma_f
 ; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
 
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -390,7 +390,7 @@ define void @aggressive_combine_to_fma_f
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load volatile float, float addrspace(1)* %in1
@@ -406,7 +406,7 @@ define void @test_f32_mul_add_x_one_y(fl
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load volatile float, float addrspace(1)* %in1
@@ -422,7 +422,7 @@ define void @test_f32_mul_y_add_x_one(fl
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -438,7 +438,7 @@ define void @test_f32_mul_add_x_negone_y
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -454,7 +454,7 @@ define void @test_f32_mul_y_add_x_negone
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -470,7 +470,7 @@ define void @test_f32_mul_sub_one_x_y(fl
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -486,7 +486,7 @@ define void @test_f32_mul_y_sub_one_x(fl
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -502,7 +502,7 @@ define void @test_f32_mul_sub_negone_x_y
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -518,7 +518,7 @@ define void @test_f32_mul_y_sub_negone_x
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -534,7 +534,7 @@ define void @test_f32_mul_sub_x_one_y(fl
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                       float addrspace(1)* %in1,
                                       float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -550,7 +550,7 @@ define void @test_f32_mul_y_sub_x_one(fl
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -566,7 +566,7 @@ define void @test_f32_mul_sub_x_negone_y
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -588,7 +588,7 @@ define void @test_f32_mul_y_sub_x_negone
 ;
 ; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
-define void @test_f32_interp(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out,
                              float addrspace(1)* %in1,
                              float addrspace(1)* %in2,
                              float addrspace(1)* %in3) {
@@ -610,7 +610,7 @@ define void @test_f32_interp(float addrs
 ;
 ; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
 ; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
-define void @test_f64_interp(double addrspace(1)* %out,
+define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
                              double addrspace(1)* %in1,
                              double addrspace(1)* %in2,
                              double addrspace(1)* %in3) {

Modified: llvm/trunk/test/CodeGen/AMDGPU/fma.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fma.f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fma.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fma.f64.ll Tue Mar 21 16:39:51 2017
@@ -8,7 +8,7 @@ declare <4 x double> @llvm.fma.v4f64(<4
 
 ; FUNC-LABEL: {{^}}fma_f64:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -21,7 +21,7 @@ define void @fma_f64(double addrspace(1)
 ; FUNC-LABEL: {{^}}fma_v2f64:
 ; SI: v_fma_f64
 ; SI: v_fma_f64
-define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                        <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
    %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
    %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
@@ -36,7 +36,7 @@ define void @fma_v2f64(<2 x double> addr
 ; SI: v_fma_f64
 ; SI: v_fma_f64
 ; SI: v_fma_f64
-define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
                        <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
    %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
    %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2

Modified: llvm/trunk/test/CodeGen/AMDGPU/fma.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fma.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fma.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fma.ll Tue Mar 21 16:39:51 2017
@@ -12,7 +12,7 @@ declare i32 @llvm.r600.read.tidig.x() no
 
 ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}},
 ; EG: FMA {{\*? *}}[[RES]]
-define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                      float addrspace(1)* %in2, float addrspace(1)* %in3) {
   %r0 = load float, float addrspace(1)* %in1
   %r1 = load float, float addrspace(1)* %in2
@@ -29,7 +29,7 @@ define void @fma_f32(float addrspace(1)*
 ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}},
 ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]]
 ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
-define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
                        <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
   %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1
   %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2
@@ -50,7 +50,7 @@ define void @fma_v2f32(<2 x float> addrs
 ; EG-DAG: FMA {{\*? *}}[[RES]].Y
 ; EG-DAG: FMA {{\*? *}}[[RES]].Z
 ; EG-DAG: FMA {{\*? *}}[[RES]].W
-define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
                        <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
   %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1
   %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2
@@ -62,7 +62,7 @@ define void @fma_v4f32(<4 x float> addrs
 
 ; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
 ; SI: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, 2.0, {{v[0-9]+}}
-define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -77,7 +77,7 @@ define void @fma_commute_mul_inline_imm_
 }
 
 ; FUNC-LABEL: @fma_commute_mul_s_f32
-define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
+define amdgpu_kernel void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
   %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmax3.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmax3.f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmax3.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmax3.f64.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@ declare double @llvm.maxnum.f64(double,
 ; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
+define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
   %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1
   %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2
   %a = load volatile double, double addrspace(1)* %aptr, align 8

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmax3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmax3.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmax3.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmax3.ll Tue Mar 21 16:39:51 2017
@@ -10,7 +10,7 @@ declare float @llvm.maxnum.f32(float, fl
 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
   %a = load volatile  float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
@@ -28,7 +28,7 @@ define void @test_fmax3_olt_0(float addr
 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.f64.ll Tue Mar 21 16:39:51 2017
@@ -4,7 +4,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 ; FUNC-LABEL: @test_fmax_legacy_uge_f64
-define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -19,7 +19,7 @@ define void @test_fmax_legacy_uge_f64(do
 }
 
 ; FUNC-LABEL: @test_fmax_legacy_oge_f64
-define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -34,7 +34,7 @@ define void @test_fmax_legacy_oge_f64(do
 }
 
 ; FUNC-LABEL: @test_fmax_legacy_ugt_f64
-define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -49,7 +49,7 @@ define void @test_fmax_legacy_ugt_f64(do
 }
 
 ; FUNC-LABEL: @test_fmax_legacy_ogt_f64
-define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmax_legacy.ll Tue Mar 21 16:39:51 2017
@@ -13,7 +13,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 
 ; EG: MAX
-define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -33,7 +33,7 @@ define void @test_fmax_legacy_uge_f32(fl
 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
-define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -53,7 +53,7 @@ define void @test_fmax_legacy_oge_f32(fl
 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
-define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -73,7 +73,7 @@ define void @test_fmax_legacy_ugt_f32(fl
 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
-define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -93,7 +93,7 @@ define void @test_fmax_legacy_ogt_f32(fl
 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
-define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
@@ -114,7 +114,7 @@ define void @test_fmax_legacy_ogt_v1f32(
 ; SI-NONAN: v_max_f32_e32
 ; SI-NONAN: v_max_f32_e32
 ; SI-NONAN: v_max_f32_e32
-define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1
@@ -137,7 +137,7 @@ define void @test_fmax_legacy_ogt_v3f32(
 ; SI-NOT: v_max_
 
 ; EG: MAX
-define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.f64.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@ declare <16 x double> @llvm.maxnum.v16f6
 
 ; FUNC-LABEL: @test_fmax_f64
 ; SI: v_max_f64
-define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
   %val = call double @llvm.maxnum.f64(double %a, double %b) #0
   store double %val, double addrspace(1)* %out, align 8
   ret void
@@ -18,7 +18,7 @@ define void @test_fmax_f64(double addrsp
 ; FUNC-LABEL: @test_fmax_v2f64
 ; SI: v_max_f64
 ; SI: v_max_f64
-define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0
   store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -29,7 +29,7 @@ define void @test_fmax_v2f64(<2 x double
 ; SI: v_max_f64
 ; SI: v_max_f64
 ; SI: v_max_f64
-define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0
   store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
   ret void
@@ -44,7 +44,7 @@ define void @test_fmax_v4f64(<4 x double
 ; SI: v_max_f64
 ; SI: v_max_f64
 ; SI: v_max_f64
-define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0
   store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
   ret void
@@ -67,7 +67,7 @@ define void @test_fmax_v8f64(<8 x double
 ; SI: v_max_f64
 ; SI: v_max_f64
 ; SI: v_max_f64
-define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0
   store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmaxnum.ll Tue Mar 21 16:39:51 2017
@@ -14,7 +14,7 @@ declare double @llvm.maxnum.f64(double,
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float %b) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -27,7 +27,7 @@ define void @test_fmax_f32(float addrspa
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0
   store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -44,7 +44,7 @@ define void @test_fmax_v2f32(<2 x float>
 ; EG: MAX_DX10 {{.*}}[[OUT]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0
   store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -70,7 +70,7 @@ define void @test_fmax_v4f32(<4 x float>
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
-define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0
   store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -114,7 +114,7 @@ define void @test_fmax_v8f32(<8 x float>
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W
-define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0
   store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
   ret void
@@ -128,7 +128,7 @@ define void @test_fmax_v16f32(<16 x floa
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -143,7 +143,7 @@ define void @constant_fold_fmax_f32(floa
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 ; EG: 2143289344(nan)
-define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -157,7 +157,7 @@ define void @constant_fold_fmax_f32_nan_
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -171,7 +171,7 @@ define void @constant_fold_fmax_f32_val_
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -185,7 +185,7 @@ define void @constant_fold_fmax_f32_nan_
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -199,7 +199,7 @@ define void @constant_fold_fmax_f32_p0_p
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -213,7 +213,7 @@ define void @constant_fold_fmax_f32_p0_n
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -227,7 +227,7 @@ define void @constant_fold_fmax_f32_n0_p
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -239,7 +239,7 @@ define void @constant_fold_fmax_f32_n0_n
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -250,7 +250,7 @@ define void @fmax_var_immediate_f32(floa
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -262,7 +262,7 @@ define void @fmax_immediate_var_f32(floa
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -274,7 +274,7 @@ define void @fmax_var_literal_f32(float
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 ; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32:
 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}}
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
-define void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -27,7 +27,7 @@ define void @v_test_nnan_input_fmed3_r_i
 
 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -45,7 +45,7 @@ define void @v_test_fmed3_r_i_i_f32(floa
 
 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -63,7 +63,7 @@ define void @v_test_fmed3_r_i_i_commute0
 
 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -79,7 +79,7 @@ define void @v_test_fmed3_r_i_i_commute1
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32:
 ; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -96,7 +96,7 @@ define void @v_test_fmed3_r_i_i_constant
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32:
 ; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -113,7 +113,7 @@ define void @v_test_fmed3_r_i_i_multi_us
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64:
 ; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0
 ; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0
-define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid
@@ -128,7 +128,7 @@ define void @v_test_fmed3_r_i_i_f64(doub
 
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32:
 ; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
-define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -146,7 +146,7 @@ define void @v_test_fmed3_r_i_i_no_nans_
 
 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -169,7 +169,7 @@ define void @v_test_legacy_fmed3_r_i_i_f
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -192,7 +192,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], -[[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -215,7 +215,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], -[[C]]
-define void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -238,7 +238,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]|
-define void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -267,7 +267,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]|
-define void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -301,7 +301,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
 ; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
-define void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -341,7 +341,7 @@ define void @v_nnan_inputs_med3_f32_pat0
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -363,7 +363,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -385,7 +385,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -407,7 +407,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -429,7 +429,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -451,7 +451,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -473,7 +473,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -495,7 +495,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -517,7 +517,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -539,7 +539,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -561,7 +561,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -583,7 +583,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -605,7 +605,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -627,7 +627,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -649,7 +649,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -671,7 +671,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
-define void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -697,7 +697,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN-DAG: v_max_f32
 ; GCN: v_min_f32
 ; GCN: v_max_f32
-define void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -716,7 +716,7 @@ define void @v_test_safe_med3_f32_pat0_m
 }
 
 ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1:
-define void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -735,7 +735,7 @@ define void @v_test_safe_med3_f32_pat0_m
 }
 
 ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2:
-define void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -755,7 +755,7 @@ define void @v_test_safe_med3_f32_pat0_m
 
 
 ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0:
-define void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -773,7 +773,7 @@ define void @v_test_safe_med3_f32_pat0(f
 }
 
 ; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0:
-define void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -796,7 +796,7 @@ define void @v_nnan_inputs_missing0_med3
 }
 
 ; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0:
-define void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -819,7 +819,7 @@ define void @v_nnan_inputs_missing1_med3
 }
 
 ; GCN-LABEL: {{^}}v_nnan_inputs_missing2_med3_f32_pat0:
-define void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -849,7 +849,7 @@ define void @v_nnan_inputs_missing2_med3
 ; GCN: v_max_f32
 ; GCN: v_min_f32
 ; GCN: v_max_f32
-define void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -874,7 +874,7 @@ define void @v_test_global_nnans_med3_f3
 ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[B]], [[A]]
 ; GCN: v_min_f32_e32 v{{[0-9]+}}, [[C]], [[MAX]]
-define void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
@@ -901,7 +901,7 @@ define void @v_test_global_nnans_min_max
 
 ; GFX9: v_add_f16_e32 v{{[0-9]+}}, 1.0
 ; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
-define void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -938,7 +938,7 @@ define void @v_test_nnan_input_fmed3_r_i
 ; VI: v_max_f16
 
 ; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
-define void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 {
+define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmin3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmin3.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmin3.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmin3.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@ declare float @llvm.minnum.f32(float, fl
 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
@@ -29,7 +29,7 @@ define void @test_fmin3_olt_0(float addr
 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.f64.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 ; FUNC-LABEL: @test_fmin_legacy_f64
-define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
+define amdgpu_kernel void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
    %r0 = extractelement <4 x double> %reg0, i32 0
    %r1 = extractelement <4 x double> %reg0, i32 1
    %r2 = fcmp uge double %r0, %r1
@@ -14,7 +14,7 @@ define void @test_fmin_legacy_f64(<4 x d
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_ule_f64
-define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -29,7 +29,7 @@ define void @test_fmin_legacy_ule_f64(do
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_ole_f64
-define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -44,7 +44,7 @@ define void @test_fmin_legacy_ole_f64(do
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_olt_f64
-define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -59,7 +59,7 @@ define void @test_fmin_legacy_olt_f64(do
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_ult_f64
-define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmin_legacy.ll Tue Mar 21 16:39:51 2017
@@ -14,7 +14,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
 ; EG: MIN *
 ; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
+define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
    %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = extractelement <4 x float> %reg0, i32 1
    %r2 = fcmp uge float %r0, %r1
@@ -34,7 +34,7 @@ define void @s_test_fmin_legacy_subreg_i
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]]
 
-define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %cmp = fcmp ule float %a, %b
   %val = select i1 %cmp, float %a, float %b
   store float %val, float addrspace(1)* %out, align 4
@@ -46,7 +46,7 @@ define void @s_test_fmin_legacy_ule_f32(
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -65,7 +65,7 @@ define void @test_fmin_legacy_ule_f32(fl
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -84,7 +84,7 @@ define void @test_fmin_legacy_ole_f32(fl
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -103,7 +103,7 @@ define void @test_fmin_legacy_olt_f32(fl
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -122,7 +122,7 @@ define void @test_fmin_legacy_ult_f32(fl
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
@@ -144,7 +144,7 @@ define void @test_fmin_legacy_ult_v1f32(
 
 ; SI-NONAN: v_min_f32_e32
 ; SI-NONAN: v_min_f32_e32
-define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %gep.0, i32 1
@@ -166,7 +166,7 @@ define void @test_fmin_legacy_ult_v2f32(
 ; SI-NONAN: v_min_f32_e32
 ; SI-NONAN: v_min_f32_e32
 ; SI-NONAN: v_min_f32_e32
-define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1
@@ -188,7 +188,7 @@ define void @test_fmin_legacy_ult_v3f32(
 ; SI-NEXT: v_cndmask_b32
 ; SI-NOT: v_min
 ; SI: s_endpgm
-define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/fminnum.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fminnum.f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fminnum.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fminnum.f64.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@ declare <16 x double> @llvm.minnum.v16f6
 
 ; FUNC-LABEL: @test_fmin_f64
 ; SI: v_min_f64
-define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
   %val = call double @llvm.minnum.f64(double %a, double %b) #0
   store double %val, double addrspace(1)* %out, align 8
   ret void
@@ -18,7 +18,7 @@ define void @test_fmin_f64(double addrsp
 ; FUNC-LABEL: @test_fmin_v2f64
 ; SI: v_min_f64
 ; SI: v_min_f64
-define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0
   store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -29,7 +29,7 @@ define void @test_fmin_v2f64(<2 x double
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
-define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0
   store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
   ret void
@@ -44,7 +44,7 @@ define void @test_fmin_v4f64(<4 x double
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
-define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0
   store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
   ret void
@@ -67,7 +67,7 @@ define void @test_fmin_v8f64(<8 x double
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
-define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0
   store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/fminnum.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fminnum.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fminnum.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fminnum.ll Tue Mar 21 16:39:51 2017
@@ -13,7 +13,7 @@ declare <16 x float> @llvm.minnum.v16f32
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
-define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float %b) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -26,7 +26,7 @@ define void @test_fmin_f32(float addrspa
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
-define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0
   store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -43,7 +43,7 @@ define void @test_fmin_v2f32(<2 x float>
 ; EG: MIN_DX10 {{.*}}[[OUT]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
-define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0
   store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -69,7 +69,7 @@ define void @test_fmin_v4f32(<4 x float>
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W
-define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0
   store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -113,7 +113,7 @@ define void @test_fmin_v8f32(<8 x float>
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W
-define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0
   store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
   ret void
@@ -127,7 +127,7 @@ define void @test_fmin_v16f32(<16 x floa
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -142,7 +142,7 @@ define void @constant_fold_fmin_f32(floa
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 ; EG: 2143289344({{nan|1\.#QNAN0e\+00}})
-define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -156,7 +156,7 @@ define void @constant_fold_fmin_f32_nan_
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -170,7 +170,7 @@ define void @constant_fold_fmin_f32_val_
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -184,7 +184,7 @@ define void @constant_fold_fmin_f32_nan_
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -198,7 +198,7 @@ define void @constant_fold_fmin_f32_p0_p
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -212,7 +212,7 @@ define void @constant_fold_fmin_f32_p0_n
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -226,7 +226,7 @@ define void @constant_fold_fmin_f32_n0_p
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -237,7 +237,7 @@ define void @constant_fold_fmin_f32_n0_n
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -248,7 +248,7 @@ define void @fmin_var_immediate_f32(floa
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float 2.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -260,7 +260,7 @@ define void @fmin_immediate_var_f32(floa
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float 99.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -272,7 +272,7 @@ define void @fmin_var_literal_f32(float
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float 99.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll Tue Mar 21 16:39:51 2017
@@ -23,7 +23,7 @@ declare float @llvm.fabs.f32(float) #1
 ; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
 ; VI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
-define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
+define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
   %a11 = fadd fast float %y, -1.0
   %a12 = call float @llvm.fabs.f32(float %a11)
   %a13 = fadd fast float %x, -1.0
@@ -44,7 +44,7 @@ define void @multiple_fadd_use_test_f32(
 ; GCN-DAG: buffer_store_dword [[MUL2]]
 ; GCN-DAG: buffer_store_dword [[MAD]]
 ; GCN: s_endpgm
-define void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %mul2 = fmul fast float %x, 2.0
   %mad = fadd fast float %mul2, %y
@@ -59,7 +59,7 @@ define void @multiple_use_fadd_fmac_f32(
 ; GCN-DAG: buffer_store_dword [[MUL2]]
 ; GCN-DAG: buffer_store_dword [[MAD]]
 ; GCN: s_endpgm
-define void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %x.abs = call float @llvm.fabs.f32(float %x)
   %mul2 = fmul fast float %x.abs, 2.0
@@ -72,7 +72,7 @@ define void @multiple_use_fadd_fmad_f32(
 ; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f32:
 ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
 ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
-define void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
+define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %x.abs = call float @llvm.fabs.f32(float %x)
   %mul2 = fmul fast float %x.abs, 2.0
@@ -87,7 +87,7 @@ define void @multiple_use_fadd_multi_fma
 ; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %mul2 = fmul fast float %x, 2.0
   %muln2 = fmul fast float %x, -2.0
@@ -101,7 +101,7 @@ define void @fmul_x2_xn2_f32(float addrs
 ; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]]
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %mul2 = fmul fast float %x, 2.0
   %muln2 = fmul fast float %x, -3.0
@@ -119,7 +119,7 @@ define void @fmul_x2_xn3_f32(float addrs
 ; VI: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
 ; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
-define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
+define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %z = bitcast i16 %z.arg to half
@@ -146,7 +146,7 @@ define void @multiple_fadd_use_test_f16(
 ; GCN-DAG: buffer_store_short [[MUL2]]
 ; GCN-DAG: buffer_store_short [[MAD]]
 ; GCN: s_endpgm
-define void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -166,7 +166,7 @@ define void @multiple_use_fadd_fmac_f16(
 ; GCN-DAG: buffer_store_short [[MUL2]]
 ; GCN-DAG: buffer_store_short [[MAD]]
 ; GCN: s_endpgm
-define void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -185,7 +185,7 @@ define void @multiple_use_fadd_fmad_f16(
 ; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
 ; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
 
-define void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
+define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %z = bitcast i16 %z.arg to half
@@ -203,7 +203,7 @@ define void @multiple_use_fadd_multi_fma
 ; GCN: v_mul_f16_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0
 ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 ; GCN: buffer_store_short [[RESULT]]
-define void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -219,7 +219,7 @@ define void @fmul_x2_xn2_f16(half addrsp
 ; GCN: v_mul_f16_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]]
 ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 ; GCN: buffer_store_short [[RESULT]]
-define void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmul.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmul.f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmul.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmul.f16.ll Tue Mar 21 16:39:51 2017
@@ -11,7 +11,7 @@
 ; VI:  v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fmul_f16(
+define amdgpu_kernel void @fmul_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -31,7 +31,7 @@ entry:
 ; VI:  v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fmul_f16_imm_a(
+define amdgpu_kernel void @fmul_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -50,7 +50,7 @@ entry:
 ; VI:  v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fmul_f16_imm_b(
+define amdgpu_kernel void @fmul_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -83,7 +83,7 @@ entry:
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fmul_v2f16(
+define amdgpu_kernel void @fmul_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -110,7 +110,7 @@ entry:
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fmul_v2f16_imm_a(
+define amdgpu_kernel void @fmul_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -135,7 +135,7 @@ entry:
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fmul_v2f16_imm_b(
+define amdgpu_kernel void @fmul_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmul.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmul.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmul.ll Tue Mar 21 16:39:51 2017
@@ -6,7 +6,7 @@
 ; GCN: v_mul_f32
 
 ; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
-define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fmul float %a, %b
   store float %0, float addrspace(1)* %out
@@ -19,7 +19,7 @@ entry:
 
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
-define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
 entry:
   %0 = fmul <2 x float> %a, %b
   store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -36,7 +36,7 @@ entry:
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -49,7 +49,7 @@ define void @fmul_v4f32(<4 x float> addr
 ; GCN: v_mul_f32
 ; GCN-NOT: v_mul_f32
 ; GCN: s_endpgm
-define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
   %y = fmul float %x, 2.0
   %z = fmul float %y, 3.0
   store float %z, float addrspace(1)* %out
@@ -61,7 +61,7 @@ define void @test_mul_2_k(float addrspac
 ; GCN-NOT: v_mul_f32
 ; GCN-NOT: v_mad_f32
 ; GCN: s_endpgm
-define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
   %y = fmul float %x, 3.0
   %z = fmul float %y, 2.0
   store float %z, float addrspace(1)* %out
@@ -75,7 +75,7 @@ define void @test_mul_2_k_inv(float addr
 ; GCN: v_mul_f32
 ; GCN: v_mul_f32
 ; GCN-NOT: v_mul_f32
-define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
   %a = fmul float %x, 5.0
   %b = fsub float -0.0, %a
   %c = fmul float %b, %y

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmul64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmul64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmul64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmul64.ll Tue Mar 21 16:39:51 2017
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}fmul_f64:
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -15,7 +15,7 @@ define void @fmul_f64(double addrspace(1
 ; FUNC-LABEL: {{^}}fmul_v2f64:
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                         <2 x double> addrspace(1)* %in2) {
    %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
    %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
@@ -29,7 +29,7 @@ define void @fmul_v2f64(<2 x double> add
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
                         <4 x double> addrspace(1)* %in2) {
    %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
    %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f16.ll Tue Mar 21 16:39:51 2017
@@ -16,7 +16,7 @@ declare half @llvm.fabs.f16(half) #1
 ; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
 ; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                          half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
   %r0 = load half, half addrspace(1)* %in1
   %r1 = load half, half addrspace(1)* %in2
@@ -34,7 +34,7 @@ define void @fmuladd_f16(half addrspace(
 
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -56,7 +56,7 @@ define void @fmuladd_2.0_a_b_f16(half ad
 
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -82,7 +82,7 @@ define void @fmuladd_a_2.0_b_f16(half ad
 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
 
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_f16(half addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                             half addrspace(1)* %in1,
                             half addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -111,7 +111,7 @@ define void @fadd_a_a_b_f16(half addrspa
 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
 
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_b_a_a_f16(half addrspace(1)* %out,
+define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                             half addrspace(1)* %in1,
                             half addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -134,7 +134,7 @@ define void @fadd_b_a_a_f16(half addrspa
 ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
 ; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -156,7 +156,7 @@ define void @fmuladd_neg_2.0_a_b_f16(hal
 
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -180,7 +180,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f16
 
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -202,7 +202,7 @@ define void @fmuladd_2.0_neg_a_b_f16(hal
 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -231,7 +231,7 @@ define void @fmuladd_2.0_a_neg_b_f16(hal
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -261,7 +261,7 @@ define void @mad_sub_f16(half addrspace(
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -291,7 +291,7 @@ define void @mad_sub_inv_f16(half addrsp
 ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -323,7 +323,7 @@ define void @mad_sub_fabs_f16(half addrs
 ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -355,7 +355,7 @@ define void @mad_sub_fabs_inv_f16(half a
 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -388,7 +388,7 @@ define void @neg_neg_mad_f16(half addrsp
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -419,7 +419,7 @@ define void @mad_fabs_sub_f16(half addrs
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
 
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -447,7 +447,7 @@ define void @fsub_c_fadd_a_a_f16(half ad
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f32.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f32.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f32.ll Tue Mar 21 16:39:51 2017
@@ -25,7 +25,7 @@ declare float @llvm.fabs.f32(float) #1
 
 ; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                          float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
   %r0 = load float, float addrspace(1)* %in1
   %r1 = load float, float addrspace(1)* %in2
@@ -45,7 +45,7 @@ define void @fmuladd_f32(float addrspace
 
 ; GCN-DENORM-STRICT: v_mul_f32_e32
 ; GCN-DENORM-STRICT: v_add_f32_e32
-define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                            float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
   %r0 = load volatile float, float addrspace(1)* %in1
   %r1 = load volatile float, float addrspace(1)* %in2
@@ -71,7 +71,7 @@ define void @fmul_fadd_f32(float addrspa
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -100,7 +100,7 @@ define void @fmuladd_2.0_a_b_f32(float a
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -132,7 +132,7 @@ define void @fmuladd_a_2.0_b_f32(float a
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_f32(float addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -167,7 +167,7 @@ define void @fadd_a_a_b_f32(float addrsp
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_b_a_a_f32(float addrspace(1)* %out,
+define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -196,7 +196,7 @@ define void @fadd_b_a_a_f32(float addrsp
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -225,7 +225,7 @@ define void @fmuladd_neg_2.0_a_b_f32(flo
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -256,7 +256,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f32
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -286,7 +286,7 @@ define void @fmuladd_2.0_neg_a_b_f32(flo
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -318,7 +318,7 @@ define void @fmuladd_2.0_a_neg_b_f32(flo
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -353,7 +353,7 @@ define void @mad_sub_f32(float addrspace
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -387,7 +387,7 @@ define void @mad_sub_inv_f32(float addrs
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -422,7 +422,7 @@ define void @mad_sub_fabs_f32(float addr
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -460,7 +460,7 @@ define void @mad_sub_fabs_inv_f32(float
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -496,7 +496,7 @@ define void @neg_neg_mad_f32(float addrs
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -532,7 +532,7 @@ define void @mad_fabs_sub_f32(float addr
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -563,7 +563,7 @@ define void @fsub_c_fadd_a_a_f32(float a
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f64.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f64.ll Tue Mar 21 16:39:51 2017
@@ -7,7 +7,7 @@
 
 ; GCN-LABEL: {{^}}fmuladd_f64:
 ; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                          double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -22,7 +22,7 @@ define void @fmuladd_f64(double addrspac
 
 ; GCN-STRICT: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 ; GCN-STRICT: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                            double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -44,7 +44,7 @@ define void @fmul_fadd_f64(double addrsp
 
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_f64(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64(double addrspace(1)* %out,
                             double addrspace(1)* %in1,
                             double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -72,7 +72,7 @@ define void @fadd_a_a_b_f64(double addrs
 
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_b_a_a_f64(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_b_a_a_f64(double addrspace(1)* %out,
                             double addrspace(1)* %in1,
                             double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -94,7 +94,7 @@ define void @fadd_b_a_a_f64(double addrs
 ; GCN-STRICT: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
 
 ; GCN-CONTRACT: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
-define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext
@@ -117,7 +117,7 @@ define void @mad_sub_f64(double addrspac
 ; GCN-STRICT: v_add_f64
 
 ; GCN-CONTRACT: v_fma_f64
-define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
                                       double addrspace(1)* %in1,
                                       double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -139,7 +139,7 @@ define void @fadd_a_a_b_f64_fast_add0(do
 ; GCN-STRICT: v_add_f64
 
 ; GCN-CONTRACT: v_fma_f64
-define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
                                       double addrspace(1)* %in1,
                                       double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -158,7 +158,7 @@ define void @fadd_a_a_b_f64_fast_add1(do
 
 ; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast:
 ; GCN: v_fma_f64
-define void @fadd_a_a_b_f64_fast(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64_fast(double addrspace(1)* %out,
                                  double addrspace(1)* %in1,
                                 double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

Modified: llvm/trunk/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmuladd.v2f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmuladd.v2f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmuladd.v2f16.ll Tue Mar 21 16:39:51 2017
@@ -17,7 +17,7 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x
 ; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
 ; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
                          <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
   %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
   %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
@@ -37,7 +37,7 @@ define void @fmuladd_v2f16(<2 x half> ad
 
 ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
 ; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
@@ -61,7 +61,7 @@ define void @fmuladd_2.0_a_b_v2f16(<2 x
 
 ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
 ; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
@@ -86,7 +86,7 @@ define void @fmuladd_a_2.0_b_v2f16(<2 x
 
 ; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out,
                             <2 x half> addrspace(1)* %in1,
                             <2 x half> addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()

Modified: llvm/trunk/test/CodeGen/AMDGPU/fnearbyint.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fnearbyint.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fnearbyint.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fnearbyint.ll Tue Mar 21 16:39:51 2017
@@ -13,41 +13,41 @@ declare <2 x double> @llvm.nearbyint.v2f
 declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0
 
 
-define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 {
 entry:
   %0 = call float @llvm.nearbyint.f32(float %in)
   store float %0, float addrspace(1)* %out
   ret void
 }
 
-define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
 entry:
   %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in)
   store <2 x float> %0, <2 x float> addrspace(1)* %out
   ret void
 }
 
-define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
+define amdgpu_kernel void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
 entry:
   %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in)
   store <4 x float> %0, <4 x float> addrspace(1)* %out
   ret void
 }
 
-define void @nearbyint_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @nearbyint_f64(double addrspace(1)* %out, double %in) {
 entry:
   %0 = call double @llvm.nearbyint.f64(double %in)
   store double %0, double addrspace(1)* %out
   ret void
 }
-define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
 entry:
   %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in)
   store <2 x double> %0, <2 x double> addrspace(1)* %out
   ret void
 }
 
-define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
 entry:
   %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in)
   store <4 x double> %0, <4 x double> addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/fneg-combines.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fneg-combines.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fneg-combines.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fneg-combines.ll Tue Mar 21 16:39:51 2017
@@ -14,7 +14,7 @@
 
 ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -35,7 +35,7 @@ define void @v_fneg_add_f32(float addrsp
 ; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
 ; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -62,7 +62,7 @@ define void @v_fneg_add_store_use_add_f3
 ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[ADD]]
 ; GCN: buffer_store_dword [[NEG_ADD]]
 ; GCN-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -87,7 +87,7 @@ define void @v_fneg_add_multi_use_add_f3
 
 ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -111,7 +111,7 @@ define void @v_fneg_add_fneg_x_f32(float
 
 ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -135,7 +135,7 @@ define void @v_fneg_add_x_fneg_f32(float
 
 ; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -164,7 +164,7 @@ define void @v_fneg_add_fneg_fneg_f32(fl
 ; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
-define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -192,7 +192,7 @@ define void @v_fneg_add_store_use_fneg_x
 ; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -218,7 +218,7 @@ define void @v_fneg_add_multi_use_fneg_x
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
 ; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -239,7 +239,7 @@ define void @v_fneg_mul_f32(float addrsp
 ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
 ; GCN: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -261,7 +261,7 @@ define void @v_fneg_mul_store_use_mul_f3
 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
 ; GCN-NEXT: buffer_store_dword [[MUL0]]
 ; GCN-NEXT: buffer_store_dword [[MUL1]]
-define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -282,7 +282,7 @@ define void @v_fneg_mul_multi_use_mul_f3
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -302,7 +302,7 @@ define void @v_fneg_mul_fneg_x_f32(float
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -322,7 +322,7 @@ define void @v_fneg_mul_x_fneg_f32(float
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -345,7 +345,7 @@ define void @v_fneg_mul_fneg_fneg_f32(fl
 ; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
 ; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -368,7 +368,7 @@ define void @v_fneg_mul_store_use_fneg_x
 ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -394,7 +394,7 @@ define void @v_fneg_mul_multi_use_fneg_x
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -412,7 +412,7 @@ define void @v_fneg_minnum_f32(float add
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -428,7 +428,7 @@ define void @v_fneg_self_minnum_f32(floa
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -444,7 +444,7 @@ define void @v_fneg_posk_minnum_f32(floa
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -460,7 +460,7 @@ define void @v_fneg_negk_minnum_f32(floa
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -476,7 +476,7 @@ define void @v_fneg_0_minnum_f32(float a
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -494,7 +494,7 @@ define void @v_fneg_neg0_minnum_f32(floa
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -516,7 +516,7 @@ define void @v_fneg_0_minnum_foldable_us
 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
 ; GCN-NEXT: buffer_store_dword [[MAX0]]
 ; GCN-NEXT: buffer_store_dword [[MUL1]]
-define void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -541,7 +541,7 @@ define void @v_fneg_minnum_multi_use_min
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -559,7 +559,7 @@ define void @v_fneg_maxnum_f32(float add
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -575,7 +575,7 @@ define void @v_fneg_self_maxnum_f32(floa
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -591,7 +591,7 @@ define void @v_fneg_posk_maxnum_f32(floa
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -607,7 +607,7 @@ define void @v_fneg_negk_maxnum_f32(floa
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -623,7 +623,7 @@ define void @v_fneg_0_maxnum_f32(float a
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -641,7 +641,7 @@ define void @v_fneg_neg0_maxnum_f32(floa
 ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -663,7 +663,7 @@ define void @v_fneg_0_maxnum_foldable_us
 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
 ; GCN-NEXT: buffer_store_dword [[MAX0]]
 ; GCN-NEXT: buffer_store_dword [[MUL1]]
-define void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -693,7 +693,7 @@ define void @v_fneg_maxnum_multi_use_max
 
 ; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -717,7 +717,7 @@ define void @v_fneg_fma_f32(float addrsp
 ; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
 ; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
 ; GCN-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -748,7 +748,7 @@ define void @v_fneg_fma_store_use_fma_f3
 
 ; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
 ; GCN-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -776,7 +776,7 @@ define void @v_fneg_fma_multi_use_fma_f3
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -803,7 +803,7 @@ define void @v_fneg_fma_fneg_x_y_f32(flo
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -830,7 +830,7 @@ define void @v_fneg_fma_x_fneg_y_f32(flo
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -858,7 +858,7 @@ define void @v_fneg_fma_fneg_fneg_y_f32(
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -886,7 +886,7 @@ define void @v_fneg_fma_fneg_x_fneg_f32(
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -916,7 +916,7 @@ define void @v_fneg_fma_x_y_fneg_f32(flo
 ; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
-define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -946,7 +946,7 @@ define void @v_fneg_fma_store_use_fneg_x
 ; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_FMA]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
+define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -979,7 +979,7 @@ define void @v_fneg_fma_multi_use_fneg_x
 
 ; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1009,7 +1009,7 @@ define void @v_fneg_fmad_f32(float addrs
 
 ; GCN: buffer_store_dword [[NEG_MAD]]
 ; GCN-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1035,7 +1035,7 @@ define void @v_fneg_fmad_multi_use_fmad_
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1051,7 +1051,7 @@ define void @v_fneg_fp_extend_f32_to_f64
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1070,7 +1070,7 @@ define void @v_fneg_fp_extend_fneg_f32_t
 ; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
 ; GCN: buffer_store_dword [[FNEG_A]]
-define void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1090,7 +1090,7 @@ define void @v_fneg_fp_extend_store_use_
 ; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}
-define void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1110,7 +1110,7 @@ define void @v_fneg_multi_use_fp_extend_
 ; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
 ; GCN: buffer_store_dwordx2 [[MUL]]
-define void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1126,7 +1126,7 @@ define void @v_fneg_multi_foldable_use_f
 
 ; FIXME: Source modifiers not folded for f16->f32
 ; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
-define void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1140,7 +1140,7 @@ define void @v_fneg_multi_use_fp_extend_
 }
 
 ; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
-define void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1162,7 +1162,7 @@ define void @v_fneg_multi_foldable_use_f
 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
 ; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1178,7 +1178,7 @@ define void @v_fneg_fp_round_f64_to_f32(
 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
 ; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1197,7 +1197,7 @@ define void @v_fneg_fp_round_fneg_f64_to
 ; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[A_LO]]:[[NEG_A_HI]]{{\]}}
-define void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1217,7 +1217,7 @@ define void @v_fneg_fp_round_store_use_f
 ; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: buffer_store_dwordx2 [[USE1]]
-define void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1236,7 +1236,7 @@ define void @v_fneg_fp_round_multi_use_f
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_short [[RESULT]]
-define void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1252,7 +1252,7 @@ define void @v_fneg_fp_round_f32_to_f16(
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
 ; GCN: buffer_store_short [[RESULT]]
-define void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1271,7 +1271,7 @@ define void @v_fneg_fp_round_fneg_f32_to
 ; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
 ; GCN: buffer_store_dword [[NEG]]
 ; GCN: buffer_store_dword [[CVT]]
-define void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1290,7 +1290,7 @@ define void @v_fneg_multi_use_fp_round_f
 ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
 ; GCN: buffer_store_short [[RESULT]]
 ; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1310,7 +1310,7 @@ define void @v_fneg_fp_round_store_use_f
 ; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
 ; GCN: buffer_store_short [[RESULT]]
 ; GCN: buffer_store_dword [[USE1]]
-define void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1333,7 +1333,7 @@ define void @v_fneg_fp_round_multi_use_f
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1349,7 +1349,7 @@ define void @v_fneg_rcp_f32(float addrsp
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1368,7 +1368,7 @@ define void @v_fneg_rcp_fneg_f32(float a
 ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1388,7 +1388,7 @@ define void @v_fneg_rcp_store_use_fneg_f
 ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1411,7 +1411,7 @@ define void @v_fneg_rcp_multi_use_fneg_f
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1432,7 +1432,7 @@ define void @v_fneg_rcp_legacy_f32(float
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
 ; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1453,7 +1453,7 @@ define void @v_fneg_mul_legacy_f32(float
 ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
 ; GCN: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1476,7 +1476,7 @@ define void @v_fneg_mul_legacy_store_use
 ; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1497,7 +1497,7 @@ define void @v_fneg_mul_legacy_multi_use
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1517,7 +1517,7 @@ define void @v_fneg_mul_legacy_fneg_x_f3
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1537,7 +1537,7 @@ define void @v_fneg_mul_legacy_x_fneg_f3
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1560,7 +1560,7 @@ define void @v_fneg_mul_legacy_fneg_fneg
 ; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
 ; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1583,7 +1583,7 @@ define void @v_fneg_mul_legacy_store_use
 ; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1610,7 +1610,7 @@ define void @v_fneg_mul_legacy_multi_use
 ; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
 ; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1626,7 +1626,7 @@ define void @v_fneg_sin_f32(float addrsp
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1646,7 +1646,7 @@ define void @v_fneg_amdgcn_sin_f32(float
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1673,7 +1673,7 @@ define void @v_fneg_trunc_f32(float addr
 
 ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1693,7 +1693,7 @@ define void @v_fneg_round_f32(float addr
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1713,7 +1713,7 @@ define void @v_fneg_rint_f32(float addrs
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1735,7 +1735,7 @@ define void @v_fneg_nearbyint_f32(float
 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
 ; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
 ; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
-define void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1758,7 +1758,7 @@ define void @v_fneg_interp_p1_f32(float
 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
 ; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
 ; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
-define void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1791,7 +1791,7 @@ define void @v_fneg_interp_p2_f32(float
 ; GCN: buffer_store_dword [[MUL1]]
 
 ; GCN: buffer_store_dword [[MUL0]]
-define void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
+define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1827,7 +1827,7 @@ endif:
 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
 ; GCN: ; use [[MUL]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
+define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1856,7 +1856,7 @@ define void @v_fneg_inlineasm_f32(float
 ; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
 ; GCN: ; use [[NEG]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
+define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1888,7 +1888,7 @@ define void @v_fneg_inlineasm_multi_use_
 ; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
 ; GCN-NEXT:	buffer_store_dword [[FMA0]]
 ; GCN-NEXT:	buffer_store_dword [[FMA1]]
-define void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1920,7 +1920,7 @@ define void @multiuse_fneg_2_vop3_users_
 ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
 ; GCN-NEXT:	buffer_store_dword [[MUL0]]
 ; GCN-NEXT:	buffer_store_dword [[MUL1]]
-define void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1951,7 +1951,7 @@ define void @multiuse_fneg_2_vop2_users_
 
 ; GCN:	buffer_store_dword [[FMA0]]
 ; GCN-NEXT:	buffer_store_dword [[MUL1]]
-define void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1990,7 +1990,7 @@ define void @multiuse_fneg_vop2_vop3_use
 
 ; GCN: buffer_store_dword [[MUL1]]
 ; GCN-NEXT:	buffer_store_dword [[MUL2]]
-define void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
+define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -2025,7 +2025,7 @@ define void @free_fold_src_code_size_cos
 
 ; GCN: buffer_store_dwordx2 [[MUL0]]
 ; GCN: buffer_store_dwordx2 [[MUL1]]
-define void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
+define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -2058,7 +2058,7 @@ define void @free_fold_src_code_size_cos
 ; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
 ; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
 ; GCN: buffer_store_dword [[FMA0]]
-define void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
+define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -2088,7 +2088,7 @@ define void @one_use_cost_to_fold_into_s
 ; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]]
 ; GCN: buffer_store_dword [[FMA0]]
 ; GCN: buffer_store_dword [[MUL1]]
-define void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
+define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext

Modified: llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll?rev=298444&r1=298443&r2=298444&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll Tue Mar 21 16:39:51 2017
@@ -9,7 +9,7 @@
 
 ; GFX89-NOT: _and
 ; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
-define void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) {
+define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) {
   %fabs = call half @llvm.fabs.f16(half %x)
   %fsub = fsub half -0.0, %fabs
   %fadd = fadd half %y, %fsub
@@ -27,7 +27,7 @@ define void @fneg_fabs_fadd_f16(half add
 ; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}|
 ; GFX89-NOT: [[MUL]]
 ; GFX89: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
-define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
+define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
   %fabs = call half @llvm.fabs.f16(half %x)
   %fsub = fsub half -0.0, %fabs
   %fmul = fmul half %y, %fsub
@@ -41,7 +41,7 @@ define void @fneg_fabs_fmul_f16(half add
 
 ; GCN-LABEL: {{^}}fneg_fabs_free_f16:
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
-define void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
   %bc = bitcast i16 %in to half
   %fabs = call half @llvm.fabs.f16(half %bc)
   %fsub = fsub half -0.0, %fabs
@@ -51,7 +51,7 @@ define void @fneg_fabs_free_f16(half add
 
 ; GCN-LABEL: {{^}}fneg_fabs_f16:
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
-define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
+define amdgpu_kernel void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
   %fabs = call half @llvm.fabs.f16(half %in)
   %fsub = fsub half -0.0, %fabs
   store half %fsub, half addrspace(1)* %out, align 2
@@ -60,7 +60,7 @@ define void @fneg_fabs_f16(half addrspac
 
 ; GCN-LABEL: {{^}}v_fneg_fabs_f16:
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
-define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %val = load half, half addrspace(1)* %in, align 2
   %fabs = call half @llvm.fabs.f16(half %val)
   %fsub = fsub half -0.0, %fabs
@@ -76,7 +76,7 @@ define void @v_fneg_fabs_f16(half addrsp
 ; CIVI: flat_store_dword
 
 ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
-define void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
+define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
   store <2 x half> %fneg.fabs, <2 x half> addrspace(1)* %out
@@ -95,7 +95,7 @@ define void @s_fneg_fabs_v2f16(<2 x half
 ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
 
 ; GCN: flat_store_dwordx2
-define void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
+define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
   %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
   %fsub = fsub <4 x half> <half -0.0, half -0.0, half -0.0, half -0.0>, %fabs
   store <4 x half> %fsub, <4 x half> addrspace(1)* %out
@@ -113,7 +113,7 @@ define void @fneg_fabs_v4f16(<4 x half>
 
 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0]
-define void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
+define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
   %mul = fmul <2 x half> %fneg.fabs, <half 4.0, half 4.0>
@@ -125,7 +125,7 @@ define void @fold_user_fneg_fabs_v2f16(<
 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
 ; GFX9: v_mov_b32_e32 [[VABS:v[0-9]+]], [[ABS]]
 ; GFX9: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VABS]]
-define void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
+define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
   store <2 x half> %fabs, <2 x half> addrspace(1)* %out0
@@ -136,7 +136,7 @@ define void @s_fneg_multi_use_fabs_v2f16
 ; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_foldable_neg_v2f16:
 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0]
-define void @s_fneg_multi_use_fabs_foldable_neg_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
+define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
   %mul = fmul <2 x half> %fneg, <half 4.0, half 4.0>



