[llvm] r336373 - AMDGPU/GlobalISel: Implement custom kernel arg lowering

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 5 10:01:21 PDT 2018


Author: arsenm
Date: Thu Jul  5 10:01:20 2018
New Revision: 336373

URL: http://llvm.org/viewvc/llvm-project?rev=336373&view=rev
Log:
AMDGPU/GlobalISel: Implement custom kernel arg lowering

Avoid using allocateKernArg / AssignFn. We do not want any
of the type-splitting properties of normal calling convention
lowering.

For now, at least, this exists alongside the IR argument lowering
pass. This is necessary to handle struct padding correctly while
some arguments are still skipped by the IR argument lowering
pass.
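
For reference, the per-argument placement the patch implements boils
down to the loop below (a minimal sketch of the code added to
lowerFormalArguments; F, DL, BaseOffset, and the 16-byte
KernArgBaseAlign come from the surrounding function, and the names
mirror the patch rather than being the exact committed code):

  // Walk the explicit kernel arguments, giving each one the next
  // ABI-aligned offset in the kernarg segment.
  uint64_t ExplicitArgOffset = 0;
  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue; // e.g. an empty struct occupies no kernarg space

    unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    // The alignment known for the load is bounded by the segment's
    // base alignment, not just the type's ABI alignment.
    unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
    // ... emit a load of ArgTy from kernarg base + ArgOffset ...
  }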

Added:
    llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.h
    llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp?rev=336373&r1=336372&r2=336373&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.cpp Thu Jul  5 10:01:20 2018
@@ -43,7 +43,7 @@ bool AMDGPUCallLowering::lowerReturn(Mac
 
 unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
                                                Type *ParamTy,
-                                               unsigned Offset) const {
+                                               uint64_t Offset) const {
 
   MachineFunction &MF = MIRBuilder.getMF();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -66,7 +66,8 @@ unsigned AMDGPUCallLowering::lowerParame
 }
 
 void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
-                                        Type *ParamTy, unsigned Offset,
+                                        Type *ParamTy, uint64_t Offset,
+                                        unsigned Align,
                                         unsigned DstReg) const {
   MachineFunction &MF = MIRBuilder.getMF();
   const Function &F = MF.getFunction();
@@ -74,7 +75,6 @@ void AMDGPUCallLowering::lowerParameter(
   PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
   unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
-  unsigned Align = DL.getABITypeAlignment(ParamTy);
   unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
 
   MachineMemOperand *MMO =
@@ -95,7 +95,7 @@ bool AMDGPUCallLowering::lowerFormalArgu
     return false;
 
   MachineFunction &MF = MIRBuilder.getMF();
-  const SISubtarget *Subtarget = static_cast<const SISubtarget *>(&MF.getSubtarget());
+  const SISubtarget *Subtarget = &MF.getSubtarget<SISubtarget>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
@@ -145,6 +145,36 @@ bool AMDGPUCallLowering::lowerFormalArgu
     CCInfo.AllocateReg(FlatScratchInitReg);
   }
 
+  // The infrastructure for normal calling convention lowering is essentially
+  // useless for kernels. We want to avoid any kind of legalization or argument
+  // splitting.
+  if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
+    unsigned i = 0;
+    const unsigned KernArgBaseAlign = 16;
+    const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
+    uint64_t ExplicitArgOffset = 0;
+
+    // TODO: Align down to dword alignment and extract bits for extending loads.
+    for (auto &Arg : F.args()) {
+      Type *ArgTy = Arg.getType();
+      unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+      if (AllocSize == 0)
+        continue;
+
+      unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
+
+      uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
+
+      unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
+      ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
+      lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, VRegs[i]);
+      ++i;
+    }
+
+    return true;
+  }
+
   unsigned NumArgs = F.arg_size();
   Function::const_arg_iterator CurOrigArg = F.arg_begin();
   const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
@@ -216,13 +246,5 @@ bool AMDGPUCallLowering::lowerFormalArgu
     return true;
   }
 
-  for (unsigned i = 0; i != ArgLocs.size(); ++i, ++Arg) {
-    // FIXME: We should be getting DebugInfo from the arguments some how.
-    CCValAssign &VA = ArgLocs[i];
-    lowerParameter(MIRBuilder, Arg->getType(),
-                   VA.getLocMemOffset() +
-                   Subtarget->getExplicitKernelArgOffset(F), VRegs[i]);
-  }
-
-  return true;
+  return false;
 }
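
A quick worked example of the MinAlign step in the hunk above: the old
code took the load alignment straight from the type's ABI alignment
(the line removed from lowerParameter), which can overstate what the
argument's offset actually guarantees. MinAlign is the largest power
of two dividing both operands, i.e. the lowest set bit of A | B:

  // With KernArgBaseAlign = 16:
  //   MinAlign(16, 0)  == 16  // offset 0  -> full 16-byte alignment
  //   MinAlign(16, 8)  == 8   // offset 8  -> only 8-byte alignment
  //   MinAlign(16, 36) == 4   // offset 36 -> only 4-byte alignment
  uint64_t MinAlign(uint64_t A, uint64_t B) {
    return (A | B) & (1 + ~(A | B));
  }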

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.h?rev=336373&r1=336372&r2=336373&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUCallLowering.h Thu Jul  5 10:01:20 2018
@@ -26,10 +26,11 @@ class AMDGPUCallLowering: public CallLow
   AMDGPUAS AMDGPUASI;
 
   unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
-                             unsigned Offset) const;
+                             uint64_t Offset) const;
 
   void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
-                      unsigned Offset, unsigned DstReg) const;
+                      uint64_t Offset, unsigned Align,
+                      unsigned DstReg) const;
 
  public:
   AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td?rev=336373&r1=336372&r2=336373&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td Thu Jul  5 10:01:20 2018
@@ -85,11 +85,6 @@ def RetCC_SI_Shader : CallingConv<[
   ]>>
 ]>;
 
-// Calling convention for compute kernels
-def CC_AMDGPU_Kernel : CallingConv<[
-  CCCustom<"allocateKernArg">
-]>;
-
 def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
   (sequence "VGPR%u", 24, 255)
 >;
@@ -137,16 +132,6 @@ def RetCC_AMDGPU_Func : CallingConv<[
 ]>;
 
 def CC_AMDGPU : CallingConv<[
-  CCIf<"static_cast<const AMDGPUSubtarget&>"
-        "(State.getMachineFunction().getSubtarget()).getGeneration() >="
-          "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
-        "!AMDGPU::isShader(State.getCallingConv())",
-       CCDelegateTo<CC_AMDGPU_Kernel>>,
-  CCIf<"static_cast<const AMDGPUSubtarget&>"
-        "(State.getMachineFunction().getSubtarget()).getGeneration() < "
-          "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
-         "!AMDGPU::isShader(State.getCallingConv())",
-        CCDelegateTo<CC_AMDGPU_Kernel>>,
    CCIf<"static_cast<const AMDGPUSubtarget&>"
          "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
            "AMDGPUSubtarget::SOUTHERN_ISLANDS",

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=336373&r1=336372&r2=336373&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Thu Jul  5 10:01:20 2018
@@ -843,7 +843,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssign
   switch (CC) {
   case CallingConv::AMDGPU_KERNEL:
   case CallingConv::SPIR_KERNEL:
-    return CC_AMDGPU_Kernel;
+    llvm_unreachable("kernels should not be handled here");
   case CallingConv::AMDGPU_VS:
   case CallingConv::AMDGPU_GS:
   case CallingConv::AMDGPU_PS:
@@ -866,7 +866,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssign
   switch (CC) {
   case CallingConv::AMDGPU_KERNEL:
   case CallingConv::SPIR_KERNEL:
-    return CC_AMDGPU_Kernel;
+    llvm_unreachable("kernels should not be handled here");
   case CallingConv::AMDGPU_VS:
   case CallingConv::AMDGPU_GS:
   case CallingConv::AMDGPU_PS:

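
With CC_AMDGPU_Kernel gone, reaching these switches with a kernel
calling convention is a bug by construction; callers are expected to
take the custom path first, roughly as follows (a hypothetical
caller-side sketch, not code from this patch; lowerKernelArguments is
an illustrative name):

  if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
    // Handled by the custom kernel argument lowering, never by an
    // AssignFn-based calling convention.
    lowerKernelArguments(MIRBuilder, F, VRegs); // hypothetical helper
  } else {
    CCAssignFn *AssignFn =
        CCAssignFnForCall(F.getCallingConv(), F.isVarArg());
    // ... normal calling convention lowering ...
  }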
Added: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll?rev=336373&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll Thu Jul  5 10:01:20 2018
@@ -0,0 +1,723 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; REQUIRES: global-isel
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -O0 -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -global-isel %s -o - | FileCheck -check-prefix=HSA-VI %s
+
+define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
+  ; HSA-VI-LABEL: name: i8_arg
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
+  ; HSA-VI:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+  %ext = zext i8 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
+  ; HSA-VI-LABEL: name: i8_zext_arg
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
+  ; HSA-VI:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+  %ext = zext i8 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
+  ; HSA-VI-LABEL: name: i8_sext_arg
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8)
+  ; HSA-VI:   G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+  %ext = sext i8 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
+  ; HSA-VI-LABEL: name: i16_arg
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
+  ; HSA-VI:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+  %ext = zext i16 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
+  ; HSA-VI-LABEL: name: i16_zext_arg
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
+  ; HSA-VI:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+  %ext = zext i16 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
+  ; HSA-VI-LABEL: name: i16_sext_arg
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s16)
+  ; HSA-VI:   G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+  %ext = sext i16 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
+  ; HSA-VI-LABEL: name: i32_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store i32 %in, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
+  ; HSA-VI-LABEL: name: f32_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `float addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `float addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store float %in, float addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
+  ; HSA-VI-LABEL: name: v2i8_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `<2 x i8> addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<2 x s8>), [[LOAD]](p1) :: (store 2 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
+  ; HSA-VI-LABEL: name: v2i16_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `<2 x i16> addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<2 x s16>), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
+  ; HSA-VI-LABEL: name: v2i32_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<2 x i32> addrspace(4)* undef`, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store 8 into %ir.out, align 4, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
+  ; HSA-VI-LABEL: name: v2f32_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<2 x float> addrspace(4)* undef`, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store 8 into %ir.out, align 4, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+  ; HSA-VI-LABEL: name: v3i8_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 3 from `<3 x i8> addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<3 x s8>), [[LOAD]](p1) :: (store 3 into %ir.out, align 4, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
+  ; HSA-VI-LABEL: name: v3i16_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 6 from `<3 x i16> addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<3 x s16>), [[LOAD]](p1) :: (store 6 into %ir.out, align 4, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
+  ; HSA-VI-LABEL: name: v3i32_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 12 from `<3 x i32> addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store 12 into %ir.out, align 4, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
+  ; HSA-VI-LABEL: name: v3f32_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 12 from `<3 x float> addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store 12 into %ir.out, align 4, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
+  ; HSA-VI-LABEL: name: v4i8_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `<4 x i8> addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<4 x s8>), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
+  ; HSA-VI-LABEL: name: v4i16_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<4 x i16> addrspace(4)* undef`, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<4 x s16>), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
+  ; HSA-VI-LABEL: name: v4i32_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store 16 into %ir.out, align 4, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
+  ; HSA-VI-LABEL: name: v4f32_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<4 x float> addrspace(4)* undef`, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store 16 into %ir.out, align 4, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
+  ; HSA-VI-LABEL: name: v8i8_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<8 x i8> addrspace(4)* undef`, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<8 x s8>), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
+  ; HSA-VI-LABEL: name: v8i16_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<8 x i16> addrspace(4)* undef`, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<8 x s16>), [[LOAD]](p1) :: (store 16 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
+  ; HSA-VI-LABEL: name: v8i32_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 32 from `<8 x i32> addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store 32 into %ir.out, align 4, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
+  ; HSA-VI-LABEL: name: v8f32_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 32 from `<8 x float> addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store 32 into %ir.out, align 4, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
+  ; HSA-VI-LABEL: name: v16i8_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<16 x i8> addrspace(4)* undef`, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<16 x s8>), [[LOAD]](p1) :: (store 16 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
+  ; HSA-VI-LABEL: name: v16i16_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 32 from `<16 x i16> addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<16 x s16>), [[LOAD]](p1) :: (store 32 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
+  ; HSA-VI-LABEL: name: v16i32_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 64 from `<16 x i32> addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store 64 into %ir.out, align 4, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
+  ; HSA-VI-LABEL: name: v16f32_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 64 from `<16 x float> addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store 64 into %ir.out, align 4, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+  ; HSA-VI-LABEL: name: kernel_arg_i64
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+  store i64 %a, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
+  ; HSA-VI-LABEL: name: f64_kernel_arg
+  ; HSA-VI: bb.1.entry:
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `double addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `double addrspace(4)* undef`, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+entry:
+  store double %in, double addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+  ; HSA-VI-LABEL: name: i1_arg
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i1 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   G_STORE [[LOAD1]](s1), [[LOAD]](p1) :: (store 1 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+  store i1 %x, i1 addrspace(1)* %out, align 1
+  ret void
+}
+
+define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+  ; HSA-VI-LABEL: name: i1_arg_zext_i32
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s1)
+  ; HSA-VI:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+  %ext = zext i1 %x to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+  ; HSA-VI-LABEL: name: i1_arg_zext_i64
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD1]](s1)
+  ; HSA-VI:   G_STORE [[ZEXT]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+  %ext = zext i1 %x to i64
+  store i64 %ext, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+  ; HSA-VI-LABEL: name: i1_arg_sext_i32
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s1)
+  ; HSA-VI:   G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+  %ext = sext i1 %x to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+  ; HSA-VI-LABEL: name: i1_arg_sext_i64
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD1]](s1)
+  ; HSA-VI:   G_STORE [[SEXT]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM
+  %ext = sext i1 %x to i64
+  store i64 %ext, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
+  ; HSA-VI-LABEL: name: empty_struct_arg
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   S_ENDPGM
+  ret void
+}
+
+; The correct load offsets for these are:
+; load 4 from 0
+; load 8 from 8
+; load 4 from 24
+; load 8 from 32
+
+; With the SelectionDAG argument lowering, the alignments of the
+; struct members are not properly considered, making these offsets wrong.
+
+; FIXME: GlobalISel extractvalue emission broken
+
+define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
+  ; %val0 = extractvalue {i32, i64} %arg0, 0
+  ; %val1 = extractvalue {i32, i64} %arg0, 1
+  ; %val2 = extractvalue {i32, i64} %arg1, 0
+  ; %val3 = extractvalue {i32, i64} %arg1, 1
+  ; store volatile i32 %val0, i32 addrspace(1)* null
+  ; store volatile i64 %val1, i64 addrspace(1)* null
+  ; store volatile i32 %val2, i32 addrspace(1)* null
+  ; store volatile i64 %val3, i64 addrspace(1)* null
+  ; HSA-VI-LABEL: name: struct_argument_alignment
+  ; HSA-VI: bb.1 (%ir-block.1):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 16 from `{ i32, i64 } addrspace(4)* undef`, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
+  ; HSA-VI:   [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64)
+  ; HSA-VI:   [[LOAD2:%[0-9]+]]:_(s128) = G_LOAD [[GEP2]](p4) :: (non-temporal invariant load 16 from `{ i32, i64 } addrspace(4)* undef`, align 8, addrspace 4)
+  ; HSA-VI:   [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](s128), 0
+  ; HSA-VI:   [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD]](s128), 64
+  ; HSA-VI:   [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](s128), 0
+  ; HSA-VI:   [[EXTRACT3:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD2]](s128), 64
+  ; HSA-VI:   S_ENDPGM
+  ret void
+}
+
+; No padding between the i8 and the next struct, but the total size is
+; rounded up at the end to a 4-byte multiple.
+define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
+  ; %val0 = extractvalue <{i32, i64}> %arg0, 0
+  ; %val1 = extractvalue <{i32, i64}> %arg0, 1
+  ; %val2 = extractvalue <{i32, i64}> %arg1, 0
+  ; %val3 = extractvalue <{i32, i64}> %arg1, 1
+  ; store volatile i32 %val0, i32 addrspace(1)* null
+  ; store volatile i64 %val1, i64 addrspace(1)* null
+  ; store volatile i32 %val2, i32 addrspace(1)* null
+  ; store volatile i64 %val3, i64 addrspace(1)* null
+  ; HSA-VI-LABEL: name: packed_struct_argument_alignment
+  ; HSA-VI: bb.1 (%ir-block.1):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 12 from `<{ i32, i64 }> addrspace(4)* undef`, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
+  ; HSA-VI:   [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 4, addrspace 4)
+  ; HSA-VI:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 13
+  ; HSA-VI:   [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64)
+  ; HSA-VI:   [[LOAD2:%[0-9]+]]:_(s96) = G_LOAD [[GEP2]](p4) :: (non-temporal invariant load 12 from `<{ i32, i64 }> addrspace(4)* undef`, align 1, addrspace 4)
+  ; HSA-VI:   [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](s96), 0
+  ; HSA-VI:   [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD]](s96), 32
+  ; HSA-VI:   [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](s96), 0
+  ; HSA-VI:   [[EXTRACT3:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD2]](s96), 32
+  ; HSA-VI:   S_ENDPGM
+  ret void
+}
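
As a cross-check of the struct offsets exercised by the two tests above
(a hand computation from the data layout, matching both the MIR
constants and the ideal offsets listed in the comment block):

  // {i32, i64}: alloc size 16, align 8 (i64 member padded to offset 8)
  //   %arg0 -> offset 0             (load 4 @ 0, load 8 @ 8)
  //   i8    -> offset 16
  //   %arg1 -> alignTo(17, 8) == 24 (load 4 @ 24, load 8 @ 32)
  // => explicit kernarg segment ends at 40 bytes.
  //
  // <{i32, i64}> (packed): alloc size 12, align 1, no member padding
  //   %arg0 -> offset 0
  //   i8    -> offset 12
  //   %arg1 -> offset 13, ends at 25
  // => rounded up to a 4-byte multiple: 28 bytes.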

Modified: llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll?rev=336373&r1=336372&r2=336373&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll Thu Jul  5 10:01:20 2018
@@ -14,12 +14,9 @@
 
 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
-
-
 define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
-entry:
-  %0 = zext i8 %in to i32
-  store i32 %0, i32 addrspace(1)* %out, align 4
+  %ext = zext i8 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
 }
 
@@ -33,9 +30,8 @@ entry:
 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
 define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
-entry:
-  %0 = zext i8 %in to i32
-  store i32 %0, i32 addrspace(1)* %out, align 4
+  %ext = zext i8 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
 }
 
@@ -51,9 +47,8 @@ entry:
 ; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
 ; HSA-VI: flat_store_dword
 define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
-entry:
-  %0 = sext i8 %in to i32
-  store i32 %0, i32 addrspace(1)* %out, align 4
+  %ext = sext i8 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
 }
 
@@ -71,9 +66,8 @@ entry:
 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
 ; HSA-VI: flat_store_dword
 define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
-entry:
-  %0 = zext i16 %in to i32
-  store i32 %0, i32 addrspace(1)* %out, align 4
+  %ext = zext i16 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
 }
 
@@ -89,9 +83,8 @@ entry:
 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
 ; HSA-VI: flat_store_dword
 define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
-entry:
-  %0 = zext i16 %in to i32
-  store i32 %0, i32 addrspace(1)* %out, align 4
+  %ext = zext i16 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
 }
 
@@ -108,9 +101,8 @@ entry:
 ; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
 ; HSA-VI: flat_store_dword
 define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
-entry:
-  %0 = sext i16 %in to i32
-  store i32 %0, i32 addrspace(1)* %out, align 4
+  %ext = sext i16 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
 }
 
@@ -657,3 +649,57 @@ define amdgpu_kernel void @i1_arg_sext_i
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
 }
+
+; FUNC-LABEL: {{^}}empty_struct_arg:
+; HSA: kernarg_segment_byte_size = 0
+define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
+  ret void
+}
+
+; The correct load offsets for these are:
+; load 4 from 0
+; load 8 from 8
+; load 4 from 24
+; load 8 from 32
+
+; With the SelectionDAG argument lowering, the alignments of the
+; struct members are not properly considered, making these offsets wrong.
+
+; FIXME: Total argument size is computed wrong
+; FUNC-LABEL: {{^}}struct_argument_alignment:
+; HSA: kernarg_segment_byte_size = 40
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
+  %val0 = extractvalue {i32, i64} %arg0, 0
+  %val1 = extractvalue {i32, i64} %arg0, 1
+  %val2 = extractvalue {i32, i64} %arg1, 0
+  %val3 = extractvalue {i32, i64} %arg1, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  ret void
+}
+
+; No padding between the i8 and the next struct, but the total size is
+; rounded up at the end to a 4-byte multiple.
+; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
+; HSA: kernarg_segment_byte_size = 28
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
+define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
+  %val0 = extractvalue <{i32, i64}> %arg0, 0
+  %val1 = extractvalue <{i32, i64}> %arg0, 1
+  %val2 = extractvalue <{i32, i64}> %arg1, 0
+  %val3 = extractvalue <{i32, i64}> %arg1, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  ret void
+}
