[lld] a022b1c - [AMDGPU] Add amdgpu_gfx calling convention

Sebastian Neubauer via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 9 07:51:58 PST 2020


Author: Sebastian Neubauer
Date: 2020-11-09T16:51:44+01:00
New Revision: a022b1ccd8d849dd59a8e7b76c0406cc80dee0f0

URL: https://github.com/llvm/llvm-project/commit/a022b1ccd8d849dd59a8e7b76c0406cc80dee0f0
DIFF: https://github.com/llvm/llvm-project/commit/a022b1ccd8d849dd59a8e7b76c0406cc80dee0f0.diff

LOG: [AMDGPU] Add amdgpu_gfx calling convention

Add a calling convention called amdgpu_gfx for real function calls
within graphics shaders. For the moment, this uses the same calling
convention as other calls in amdgpu, with registers excluded for return
address, stack pointer and stack buffer descriptor.

Differential Revision: https://reviews.llvm.org/D88540

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
    llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll

Modified: 
    lld/test/ELF/lto/amdgcn-oses.ll
    llvm/docs/AMDGPUUsage.rst
    llvm/include/llvm/IR/CallingConv.h
    llvm/lib/AsmParser/LLLexer.cpp
    llvm/lib/AsmParser/LLParser.cpp
    llvm/lib/AsmParser/LLToken.h
    llvm/lib/IR/AsmWriter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
    llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
    llvm/test/Bitcode/compatibility.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
    llvm/test/CodeGen/AMDGPU/indirect-call.ll
    llvm/test/CodeGen/AMDGPU/unsupported-calls.ll

Removed: 
    


################################################################################
diff  --git a/lld/test/ELF/lto/amdgcn-oses.ll b/lld/test/ELF/lto/amdgcn-oses.ll
index dd6f382cd627..d5350ab4d5f3 100644
--- a/lld/test/ELF/lto/amdgcn-oses.ll
+++ b/lld/test/ELF/lto/amdgcn-oses.ll
@@ -33,7 +33,7 @@ define void @_start() {
 target triple = "amdgcn-amd-amdpal"
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
-define void @_start() {
+define amdgpu_cs void @_start() {
   ret void
 }
 

diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index fd3bf9c8b68e..5b9e1b86df64 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -8116,6 +8116,25 @@ The following table illustrates the required format:
   the top 32 bits of the pipeline, so the shader may use the program
   counter's top 32 bits.
 
+.. _pal_call-convention:
+
+Call Convention
+~~~~~~~~~~~~~~~
+
+For graphics use cases, the calling convention is `amdgpu_gfx`.
+
+.. note::
+
+  `amdgpu_gfx` Function calls are currently in development and are
+  subject to major changes.
+
+This calling convention shares most properties with calling non-kernel
+functions (see
+:ref:`amdgpu-amdhsa-function-call-convention-non-kernel-functions`).
+Differences are:
+
+ - Currently there are none, 
diff erences will be listed here
+
 Unspecified OS
 --------------
 

diff  --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h
index d0906de3ea4e..6a4e368b2e9d 100644
--- a/llvm/include/llvm/IR/CallingConv.h
+++ b/llvm/include/llvm/IR/CallingConv.h
@@ -241,6 +241,9 @@ namespace CallingConv {
     /// The remainder matches the regular calling convention.
     WASM_EmscriptenInvoke = 99,
 
+    /// Calling convention used for AMD graphics targets.
+    AMDGPU_Gfx = 100,
+
     /// The highest possible calling convention ID. Must be some 2^k - 1.
     MaxID = 1023
   };

diff  --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index b8f7c0803376..3e36a3293c42 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -624,6 +624,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(amdgpu_ps);
   KEYWORD(amdgpu_cs);
   KEYWORD(amdgpu_kernel);
+  KEYWORD(amdgpu_gfx);
   KEYWORD(tailcc);
 
   KEYWORD(cc);

diff  --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 607691a0069e..617831484f3c 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -2134,6 +2134,7 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
   case lltok::kw_hhvm_ccc:       CC = CallingConv::HHVM_C; break;
   case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break;
   case lltok::kw_amdgpu_vs:      CC = CallingConv::AMDGPU_VS; break;
+  case lltok::kw_amdgpu_gfx:     CC = CallingConv::AMDGPU_Gfx; break;
   case lltok::kw_amdgpu_ls:      CC = CallingConv::AMDGPU_LS; break;
   case lltok::kw_amdgpu_hs:      CC = CallingConv::AMDGPU_HS; break;
   case lltok::kw_amdgpu_es:      CC = CallingConv::AMDGPU_ES; break;

diff  --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h
index f5dd631af2c3..df770b059296 100644
--- a/llvm/lib/AsmParser/LLToken.h
+++ b/llvm/lib/AsmParser/LLToken.h
@@ -170,6 +170,7 @@ enum Kind {
   kw_amdgpu_ps,
   kw_amdgpu_cs,
   kw_amdgpu_kernel,
+  kw_amdgpu_gfx,
   kw_tailcc,
 
   // Attributes:

diff  --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index baa2fd48550a..83443fefcd49 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -399,6 +399,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::AMDGPU_PS:     Out << "amdgpu_ps"; break;
   case CallingConv::AMDGPU_CS:     Out << "amdgpu_cs"; break;
   case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break;
+  case CallingConv::AMDGPU_Gfx:    Out << "amdgpu_gfx"; break;
   }
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 05368b8845ac..842eb2c696b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -456,7 +456,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     Info = analyzeResourceUsage(MF);
   }
 
-  if (STM.isAmdPalOS())
+  if (STM.isAmdPalOS() && MFI->isEntryFunction())
     EmitPALMetadata(MF, CurrentProgramInfo);
   else if (!STM.isAmdHsaOS()) {
     EmitProgramInfoSI(MF, CurrentProgramInfo);

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 16a09bd2c656..af58df181918 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -460,8 +460,8 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
 
   CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
   const bool IsShader = AMDGPU::isShader(CC);
-  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
-                         AMDGPU::isKernel(CC);
+  const bool IsWaveEnd =
+      (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
   if (IsWaveEnd) {
     B.buildInstr(AMDGPU::S_ENDPGM)
       .addImm(0);
@@ -785,7 +785,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   if (CC == CallingConv::AMDGPU_KERNEL)
     return lowerFormalArgumentsKernel(B, F, VRegs);
 
-  const bool IsShader = AMDGPU::isShader(CC);
+  const bool IsGraphics = AMDGPU::isGraphics(CC);
   const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
 
   MachineFunction &MF = B.getMF();
@@ -826,7 +826,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
     const bool InReg = Arg.hasAttribute(Attribute::InReg);
 
     // SGPR arguments to functions not implemented.
-    if (!IsShader && InReg)
+    if (!IsGraphics && InReg)
       return false;
 
     if (Arg.hasAttribute(Attribute::SwiftSelf) ||
@@ -937,7 +937,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
 
   // Start adding system SGPRs.
   if (IsEntryFunc) {
-    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
+    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
   } else {
     CCInfo.AllocateReg(Info->getScratchRSrcReg());
     TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
@@ -1131,11 +1131,6 @@ static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
 
 bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                    CallLoweringInfo &Info) const {
-  if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
-    LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
-    return false;
-  }
-
   if (Info.IsVarArg) {
     LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
     return false;
@@ -1149,8 +1144,15 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   const DataLayout &DL = F.getParent()->getDataLayout();
+  CallingConv::ID CallConv = F.getCallingConv();
+
+  if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
+      CallConv != CallingConv::AMDGPU_Gfx) {
+    LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
+    return false;
+  }
 
-  if (AMDGPU::isShader(F.getCallingConv())) {
+  if (AMDGPU::isShader(CallConv)) {
     LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
     return false;
   }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 5cbd65b95dff..7129c6991f2a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -16,7 +16,75 @@ class CCIfExtend<CCAction A>
   : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
 
 // Calling convention for SI
-def CC_SI : CallingConv<[
+def CC_SI_Gfx : CallingConv<[
+  // 0-3 are reserved for the stack buffer descriptor
+  // 30-31 are reserved for the return address
+  // 32 is reserved for the stack pointer
+  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+    SGPR4, SGPR5, SGPR6, SGPR7,
+    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29,
+  ]>>>,
+
+  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
+  ]>>>,
+
+  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
+  CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+  CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
+  CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+  CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
+  CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
+  CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+]>;
+
+def RetCC_SI_Gfx : CallingConv<[
+  // 0-3 are reserved for the stack buffer descriptor
+  // 32 is reserved for the stack pointer
+  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+    SGPR4, SGPR5, SGPR6, SGPR7,
+    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+    SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
+    SGPR40, SGPR41, SGPR42, SGPR43
+  ]>>>,
+
+  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
+    VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
+    VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
+    VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
+    VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
+    VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
+    VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
+    VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
+    VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
+    VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
+    VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
+    VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
+    VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
+    VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
+  ]>>>,
+
+  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
+  CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+  CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
+  CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+  CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>,
+  CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
+  CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+]>;
+
+def CC_SI_SHADER : CallingConv<[
 
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
@@ -161,7 +229,7 @@ def CC_AMDGPU : CallingConv<[
    CCIf<"static_cast<const GCNSubtarget&>"
          "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
            "AMDGPUSubtarget::SOUTHERN_ISLANDS",
-        CCDelegateTo<CC_SI>>,
+        CCDelegateTo<CC_SI_SHADER>>,
    CCIf<"static_cast<const GCNSubtarget&>"
          "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
            "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index aa269080cebe..d0c348bc75d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -942,6 +942,8 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
   case CallingConv::Fast:
   case CallingConv::Cold:
     return CC_AMDGPU_Func;
+  case CallingConv::AMDGPU_Gfx:
+    return CC_SI_Gfx;
   case CallingConv::AMDGPU_KERNEL:
   case CallingConv::SPIR_KERNEL:
   default:
@@ -963,6 +965,8 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
   case CallingConv::AMDGPU_ES:
   case CallingConv::AMDGPU_LS:
     return RetCC_SI_Shader;
+  case CallingConv::AMDGPU_Gfx:
+    return RetCC_SI_Gfx;
   case CallingConv::C:
   case CallingConv::Fast:
   case CallingConv::Cold:

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 2186739a3a24..8e1571c251fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -829,31 +829,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
   }
 }
 
-static bool isArgPassedInSGPR(const Argument *A) {
-  const Function *F = A->getParent();
-
-  // Arguments to compute shaders are never a source of divergence.
-  CallingConv::ID CC = F->getCallingConv();
-  switch (CC) {
-  case CallingConv::AMDGPU_KERNEL:
-  case CallingConv::SPIR_KERNEL:
-    return true;
-  case CallingConv::AMDGPU_VS:
-  case CallingConv::AMDGPU_LS:
-  case CallingConv::AMDGPU_HS:
-  case CallingConv::AMDGPU_ES:
-  case CallingConv::AMDGPU_GS:
-  case CallingConv::AMDGPU_PS:
-  case CallingConv::AMDGPU_CS:
-    // For non-compute shaders, SGPR inputs are marked with either inreg.
-    // Everything else is in VGPRs.
-    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg);
-  default:
-    // TODO: Should calls support inreg for SGPR inputs?
-    return false;
-  }
-}
-
 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
 /// this is analyzing the collective result of all output registers. Otherwise,
 /// this is only querying a specific result index if this returns multiple
@@ -910,7 +885,7 @@ bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
 /// 
diff erent across workitems in a wavefront.
 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
   if (const Argument *A = dyn_cast<Argument>(V))
-    return !isArgPassedInSGPR(A);
+    return !AMDGPU::isArgPassedInSGPR(A);
 
   // Loads from the private and flat address spaces are divergent, because
   // threads can execute the load instruction with the same inputs and get

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 22aa27e2ef4d..6d87b6a1a73d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -76,7 +76,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   const GCNSubtarget *ST;
   const SITargetLowering *TLI;
   AMDGPUTTIImpl CommonTTI;
-  bool IsGraphicsShader;
+  bool IsGraphics;
   bool HasFP32Denormals;
   bool HasFP64FP16Denormals;
   unsigned MaxVGPRs;
@@ -142,7 +142,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
       : BaseT(TM, F.getParent()->getDataLayout()),
         ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
         TLI(ST->getTargetLowering()), CommonTTI(TM, F),
-        IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
+        IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
         MaxVGPRs(ST->getMaxNumVGPRs(
             std::max(ST->getWavesPerEU(F).first,
                      ST->getWavesPerEUForWorkGroup(
@@ -222,7 +222,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   unsigned getFlatAddressSpace() const {
     // Don't bother running InferAddressSpaces pass on graphics shaders which
     // don't use flat addressing.
-    if (IsGraphicsShader)
+    if (IsGraphics)
       return -1;
     return AMDGPUAS::FLAT_ADDRESS;
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e7c6b6ce6b0c..7987ac72e451 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1780,12 +1780,11 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
   return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
 }
 
-static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
-                                   CallingConv::ID CallConv,
-                                   ArrayRef<ISD::InputArg> Ins,
-                                   BitVector &Skipped,
-                                   FunctionType *FType,
-                                   SIMachineFunctionInfo *Info) {
+static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
+                               CallingConv::ID CallConv,
+                               ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
+                               FunctionType *FType,
+                               SIMachineFunctionInfo *Info) {
   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
     const ISD::InputArg *Arg = &Ins[I];
 
@@ -2237,7 +2236,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   FunctionType *FType = MF.getFunction().getFunctionType();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
-  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
+  if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
     DiagnosticInfoUnsupported NoGraphicsHSA(
         Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
     DAG.getContext()->diagnose(NoGraphicsHSA);
@@ -2250,12 +2249,21 @@ SDValue SITargetLowering::LowerFormalArguments(
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                  *DAG.getContext());
 
-  bool IsShader = AMDGPU::isShader(CallConv);
+  bool IsGraphics = AMDGPU::isGraphics(CallConv);
   bool IsKernel = AMDGPU::isKernel(CallConv);
   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
 
-  if (IsShader) {
-    processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
+  if (IsGraphics) {
+    assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
+           (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
+           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+           !Info->hasWorkItemIDZ());
+  }
+
+  if (CallConv == CallingConv::AMDGPU_PS) {
+    processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
 
     // At least one interpolation mode must be enabled or else the GPU will
     // hang.
@@ -2270,40 +2278,28 @@ SDValue SITargetLowering::LowerFormalArguments(
     // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
     // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
     //   enabled too.
-    if (CallConv == CallingConv::AMDGPU_PS) {
-      if ((Info->getPSInputAddr() & 0x7F) == 0 ||
-           ((Info->getPSInputAddr() & 0xF) == 0 &&
-            Info->isPSInputAllocated(11))) {
-        CCInfo.AllocateReg(AMDGPU::VGPR0);
-        CCInfo.AllocateReg(AMDGPU::VGPR1);
-        Info->markPSInputAllocated(0);
-        Info->markPSInputEnabled(0);
-      }
-      if (Subtarget->isAmdPalOS()) {
-        // For isAmdPalOS, the user does not enable some bits after compilation
-        // based on run-time states; the register values being generated here are
-        // the final ones set in hardware. Therefore we need to apply the
-        // workaround to PSInputAddr and PSInputEnable together.  (The case where
-        // a bit is set in PSInputAddr but not PSInputEnable is where the
-        // frontend set up an input arg for a particular interpolation mode, but
-        // nothing uses that input arg. Really we should have an earlier pass
-        // that removes such an arg.)
-        unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
-        if ((PsInputBits & 0x7F) == 0 ||
-            ((PsInputBits & 0xF) == 0 &&
-             (PsInputBits >> 11 & 1)))
-          Info->markPSInputEnabled(
-              countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
-      }
+    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
+        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
+      CCInfo.AllocateReg(AMDGPU::VGPR0);
+      CCInfo.AllocateReg(AMDGPU::VGPR1);
+      Info->markPSInputAllocated(0);
+      Info->markPSInputEnabled(0);
+    }
+    if (Subtarget->isAmdPalOS()) {
+      // For isAmdPalOS, the user does not enable some bits after compilation
+      // based on run-time states; the register values being generated here are
+      // the final ones set in hardware. Therefore we need to apply the
+      // workaround to PSInputAddr and PSInputEnable together.  (The case where
+      // a bit is set in PSInputAddr but not PSInputEnable is where the
+      // frontend set up an input arg for a particular interpolation mode, but
+      // nothing uses that input arg. Really we should have an earlier pass
+      // that removes such an arg.)
+      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
+      if ((PsInputBits & 0x7F) == 0 ||
+          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
+        Info->markPSInputEnabled(
+            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
     }
-
-    assert(!Info->hasDispatchPtr() &&
-           !Info->hasKernargSegmentPtr() &&
-           (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
-           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
-           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
-           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
-           !Info->hasWorkItemIDZ());
   } else if (IsKernel) {
     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
   } else {
@@ -2450,7 +2446,7 @@ SDValue SITargetLowering::LowerFormalArguments(
 
   // Start adding system SGPRs.
   if (IsEntryFunc) {
-    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
+    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
   } else {
     CCInfo.AllocateReg(Info->getScratchRSrcReg());
     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
@@ -2933,7 +2929,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     report_fatal_error("unsupported libcall legalization");
 
   if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
-      !CLI.CB->getCalledFunction()) {
+      !CLI.CB->getCalledFunction() && CallConv != CallingConv::AMDGPU_Gfx) {
     return lowerUnhandledCall(CLI, InVals,
                               "unsupported indirect call to function ");
   }
@@ -2943,11 +2939,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                               "unsupported required tail call to function ");
   }
 
-  if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
-    // Note the issue is with the CC of the calling function, not of the call
+  if (AMDGPU::isShader(CallConv)) {
+    // Note the issue is with the CC of the called function, not of the call
     // itself.
     return lowerUnhandledCall(CLI, InVals,
-                          "unsupported call from graphics shader of function ");
+                              "unsupported call to a shader function ");
+  }
+
+  if (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
+      CallConv != CallingConv::AMDGPU_Gfx) {
+    // Only allow calls with specific calling conventions.
+    return lowerUnhandledCall(CLI, InVals,
+                              "unsupported calling convention for call from "
+                              "graphics shader of function ");
   }
 
   if (IsTailCall) {
@@ -2978,7 +2982,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
 
-  if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
+  if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
+      CallConv != CallingConv::AMDGPU_Gfx) {
     // With a fixed ABI, allocate fixed registers before user arguments.
     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
   }
@@ -3112,7 +3117,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     }
   }
 
-  if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
+  if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
+      CallConv != CallingConv::AMDGPU_Gfx) {
     // Copy special input registers after user input arguments.
     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c9619d16ecef..e6c4e9cd6610 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5093,9 +5093,8 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
   // scratch memory access. In both cases, the legalization never involves
   // conversion to the addr64 form.
-  if (isMIMG(MI) ||
-      (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
-       (isMUBUF(MI) || isMTBUF(MI)))) {
+  if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
+                     (isMUBUF(MI) || isMTBUF(MI)))) {
     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
       CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);

diff  --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 17f8257e6113..91da3bcc3484 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -271,6 +271,7 @@ static bool lowerShiftReservedVGPR(MachineFunction &MF,
   FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index);
 
   for (MachineBasicBlock &MBB : MF) {
+    assert(LowestAvailableVGPR.isValid() && "Did not find an available VGPR");
     MBB.addLiveIn(LowestAvailableVGPR);
     MBB.sortUniqueLiveIns();
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index e5c560d75735..2babbaa42fa8 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -126,6 +126,7 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
   case CallingConv::C:
   case CallingConv::Fast:
   case CallingConv::Cold:
+  case CallingConv::AMDGPU_Gfx:
     return CSR_AMDGPU_HighRegs_SaveList;
   default: {
     // Dummy to not crash RegisterClassInfo.
@@ -146,6 +147,7 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
   case CallingConv::C:
   case CallingConv::Fast:
   case CallingConv::Cold:
+  case CallingConv::AMDGPU_Gfx:
     return CSR_AMDGPU_HighRegs_RegMask;
   default:
     return nullptr;

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ca65e6203651..cd824a17750e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1043,8 +1043,12 @@ bool isShader(CallingConv::ID cc) {
   }
 }
 
+bool isGraphics(CallingConv::ID cc) {
+  return isShader(cc) || cc == CallingConv::AMDGPU_Gfx;
+}
+
 bool isCompute(CallingConv::ID cc) {
-  return !isShader(cc) || cc == CallingConv::AMDGPU_CS;
+  return !isGraphics(cc) || cc == CallingConv::AMDGPU_CS;
 }
 
 bool isEntryFunctionCC(CallingConv::ID CC) {
@@ -1439,6 +1443,7 @@ bool isArgPassedInSGPR(const Argument *A) {
   case CallingConv::AMDGPU_GS:
   case CallingConv::AMDGPU_PS:
   case CallingConv::AMDGPU_CS:
+  case CallingConv::AMDGPU_Gfx:
     // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
     // Everything else is in VGPRs.
     return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d2b4ba6f9d27..e3cb4aa9641e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -567,6 +567,9 @@ unsigned getInitialPSInputAddr(const Function &F);
 LLVM_READNONE
 bool isShader(CallingConv::ID CC);
 
+LLVM_READNONE
+bool isGraphics(CallingConv::ID CC);
+
 LLVM_READNONE
 bool isCompute(CallingConv::ID CC);
 

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 12f01faba04b..85cba165770f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -736,6 +736,8 @@ static const char *getStageName(CallingConv::ID CC) {
     return ".hs";
   case CallingConv::AMDGPU_LS:
     return ".ls";
+  case CallingConv::AMDGPU_Gfx:
+    llvm_unreachable("Callable shader has no hardware stage");
   default:
     return ".cs";
   }

diff  --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index 8eb105542c0a..41fb68455bd3 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -478,6 +478,8 @@ declare cc90 void @f.cc90()
 ; CHECK: declare amdgpu_cs void @f.cc90()
 declare amdgpu_cs void @f.amdgpu_cs()
 ; CHECK: declare amdgpu_cs void @f.amdgpu_cs()
+declare amdgpu_gfx void @f.amdgpu_gfx()
+; CHECK: declare amdgpu_gfx void @f.amdgpu_gfx()
 declare cc91 void @f.cc91()
 ; CHECK: declare amdgpu_kernel void @f.cc91()
 declare amdgpu_kernel void @f.amdgpu_kernel()

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
new file mode 100644
index 000000000000..3db1182d345b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
+
+; amdgpu_gfx calling convention
+declare hidden amdgpu_gfx void @external_gfx_void_func_void() #0
+declare hidden amdgpu_gfx void @external_gfx_void_func_i32(i32) #0
+declare hidden amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg) #0
+declare hidden amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 }) #0
+declare hidden amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg) #0
+
+define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 {
+  ; CHECK-LABEL: name: test_gfx_call_external_void_func_void
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]]
+  call amdgpu_gfx void @external_gfx_void_func_void()
+  ret void
+}
+
+define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 {
+  ; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32
+  ; CHECK:   $vgpr0 = COPY [[C]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY3]]
+  call amdgpu_gfx void @external_gfx_void_func_i32(i32 42)
+  ret void
+}
+
+define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg) #0 {
+  ; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm_inreg
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK:   liveins: $sgpr4, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
+  ; CHECK:   $sgpr4 = COPY [[C]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY3]]
+  call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
+  ret void
+}
+
+define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
+  ; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load 8 from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load 1 from %ir.ptr0, align 4, addrspace 1)
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64)
+  ; CHECK:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8)
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK:   $vgpr1 = COPY [[LOAD2]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]]
+  %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
+  %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
+  call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 } %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #0 {
+  ; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32_inreg
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load 8 from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load 1 from %ir.ptr0, align 4, addrspace 1)
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64)
+  ; CHECK:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8)
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
+  ; CHECK:   $sgpr4 = COPY [[ANYEXT]](s32)
+  ; CHECK:   $sgpr5 = COPY [[LOAD2]](s32)
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; CHECK:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK:   S_SETPC_B64_return [[COPY2]]
+  %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
+  %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
+  call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg %val)
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind noinline }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index 1136afafbd4a..0b238c78486d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -56,6 +56,13 @@ declare [5 x i8] @external_a5i8_func_void() #0
 ; return value and argument
 declare hidden i32 @external_i32_func_i32(i32) #0
 
+; amdgpu_gfx calling convention
+declare i1 @external_gfx_i1_func_void() #0
+declare i8 @external_gfx_i8_func_void() #0
+declare i32 @external_gfx_i32_func_void() #0
+declare { i32, i64 } @external_gfx_i32_i64_func_void() #0
+declare hidden i32 @external_gfx_i32_func_i32(i32) #0
+
 
 define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
   ; GCN-LABEL: name: test_call_external_i32_func_i32_imm
@@ -115,6 +122,55 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)*
   ret void
 }
 
+define amdgpu_gfx void @test_gfx_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
+  ; GCN-LABEL: name: test_gfx_call_external_i32_func_i32_imm
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; GCN:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; GCN:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; GCN:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; GCN:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; GCN:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; GCN:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; GCN:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN:   [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; GCN:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+  ; GCN:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
+  ; GCN:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; GCN:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i32_func_i32
+  ; GCN:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN:   [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN:   [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN:   $vgpr0 = COPY [[C]](s32)
+  ; GCN:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
+  ; GCN:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
+  ; GCN:   $sgpr6_sgpr7 = COPY [[COPY12]](p4)
+  ; GCN:   $sgpr8_sgpr9 = COPY [[COPY13]](p4)
+  ; GCN:   $sgpr10_sgpr11 = COPY [[COPY14]](s64)
+  ; GCN:   $sgpr12 = COPY [[COPY15]](s32)
+  ; GCN:   $sgpr13 = COPY [[COPY16]](s32)
+  ; GCN:   $sgpr14 = COPY [[COPY17]](s32)
+  ; GCN:   $vgpr31 = COPY [[COPY18]](s32)
+  ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i32_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0
+  ; GCN:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; GCN:   G_STORE [[COPY20]](s32), [[MV]](p1) :: (volatile store 4 into %ir.out, addrspace 1)
+  ; GCN:   [[COPY21:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
+  ; GCN:   S_SETPC_B64_return [[COPY21]]
+  %val = call amdgpu_gfx i32 @external_gfx_i32_func_i32(i32 42)
+  store volatile i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
 define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
   ; GCN-LABEL: name: test_call_external_i1_func_void
   ; GCN: bb.1 (%ir-block.0):
@@ -171,6 +227,52 @@ define amdgpu_kernel void @test_call_external_i1_func_void() #0 {
   ret void
 }
 
+define amdgpu_gfx void @test_gfx_call_external_i1_func_void() #0 {
+  ; GCN-LABEL: name: test_gfx_call_external_i1_func_void
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; GCN:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; GCN:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; GCN:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; GCN:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; GCN:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; GCN:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; GCN:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; GCN:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GCN:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; GCN:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i1_func_void
+  ; GCN:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
+  ; GCN:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; GCN:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; GCN:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; GCN:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; GCN:   $sgpr12 = COPY [[COPY13]](s32)
+  ; GCN:   $sgpr13 = COPY [[COPY14]](s32)
+  ; GCN:   $sgpr14 = COPY [[COPY15]](s32)
+  ; GCN:   $vgpr31 = COPY [[COPY16]](s32)
+  ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i1_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
+  ; GCN:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
+  ; GCN:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 1 from %fixed-stack.0, align 16, addrspace 5)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; GCN:   G_STORE [[LOAD]](s1), [[DEF]](p1) :: (volatile store 1 into `i1 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; GCN:   S_SETPC_B64_return [[COPY18]]
+  %val = call amdgpu_gfx i1 @external_gfx_i1_func_void()
+  store volatile i1 %val, i1 addrspace(1)* undef
+  ret void
+}
+
 define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 {
   ; GCN-LABEL: name: test_call_external_i1_zeroext_func_void
   ; GCN: bb.1 (%ir-block.0):
@@ -344,6 +446,53 @@ define amdgpu_kernel void @test_call_external_i8_func_void() #0 {
   ret void
 }
 
+define amdgpu_gfx void @test_gfx_call_external_i8_func_void() #0 {
+  ; GCN-LABEL: name: test_gfx_call_external_i8_func_void
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; GCN:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; GCN:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; GCN:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; GCN:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; GCN:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; GCN:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; GCN:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; GCN:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GCN:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; GCN:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i8_func_void
+  ; GCN:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
+  ; GCN:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; GCN:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; GCN:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; GCN:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; GCN:   $sgpr12 = COPY [[COPY13]](s32)
+  ; GCN:   $sgpr13 = COPY [[COPY14]](s32)
+  ; GCN:   $sgpr14 = COPY [[COPY15]](s32)
+  ; GCN:   $vgpr31 = COPY [[COPY16]](s32)
+  ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i8_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0
+  ; GCN:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
+  ; GCN:   [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; GCN:   G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; GCN:   S_SETPC_B64_return [[COPY19]]
+  %val = call amdgpu_gfx i8 @external_gfx_i8_func_void()
+  store volatile i8 %val, i8 addrspace(1)* undef
+  ret void
+}
+
 define amdgpu_kernel void @test_call_external_i8_zeroext_func_void() #0 {
   ; GCN-LABEL: name: test_call_external_i8_zeroext_func_void
   ; GCN: bb.1 (%ir-block.0):
@@ -689,6 +838,51 @@ define amdgpu_kernel void @test_call_external_i32_func_void() #0 {
   ret void
 }
 
+define amdgpu_gfx void @test_gfx_call_external_i32_func_void() #0 {
+  ; GCN-LABEL: name: test_gfx_call_external_i32_func_void
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; GCN:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; GCN:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; GCN:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; GCN:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; GCN:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; GCN:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; GCN:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; GCN:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GCN:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; GCN:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i32_func_void
+  ; GCN:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
+  ; GCN:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; GCN:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; GCN:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; GCN:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; GCN:   $sgpr12 = COPY [[COPY13]](s32)
+  ; GCN:   $sgpr13 = COPY [[COPY14]](s32)
+  ; GCN:   $sgpr14 = COPY [[COPY15]](s32)
+  ; GCN:   $vgpr31 = COPY [[COPY16]](s32)
+  ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0
+  ; GCN:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; GCN:   G_STORE [[COPY18]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; GCN:   S_SETPC_B64_return [[COPY19]]
+  %val = call amdgpu_gfx i32 @external_gfx_i32_func_void()
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
 define amdgpu_kernel void @test_call_external_i48_func_void() #0 {
   ; GCN-LABEL: name: test_call_external_i48_func_void
   ; GCN: bb.1 (%ir-block.0):
@@ -2362,6 +2556,59 @@ define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 {
   ret void
 }
 
+define amdgpu_gfx void @test_gfx_call_external_i32_i64_func_void() #0 {
+  ; GCN-LABEL: name: test_gfx_call_external_i32_i64_func_void
+  ; GCN: bb.1 (%ir-block.0):
+  ; GCN:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; GCN:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; GCN:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; GCN:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; GCN:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; GCN:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; GCN:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; GCN:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; GCN:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; GCN:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; GCN:   [[COPY9:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
+  ; GCN:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; GCN:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i32_i64_func_void
+  ; GCN:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; GCN:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; GCN:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; GCN:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; GCN:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; GCN:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; GCN:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; GCN:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; GCN:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+  ; GCN:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; GCN:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; GCN:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
+  ; GCN:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
+  ; GCN:   $sgpr12 = COPY [[COPY14]](s32)
+  ; GCN:   $sgpr13 = COPY [[COPY15]](s32)
+  ; GCN:   $sgpr14 = COPY [[COPY16]](s32)
+  ; GCN:   $vgpr31 = COPY [[COPY17]](s32)
+  ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i32_i64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2
+  ; GCN:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GCN:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GCN:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; GCN:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+  ; GCN:   G_STORE [[COPY19]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   G_STORE [[MV]](s64), [[COPY9]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   [[COPY22:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; GCN:   S_SETPC_B64_return [[COPY22]]
+  %val = call amdgpu_gfx { i32, i64 } @external_gfx_i32_i64_func_void()
+  %val.0 = extractvalue { i32, i64 } %val, 0
+  %val.1 = extractvalue { i32, i64 } %val, 1
+  store volatile i32 %val.0, i32 addrspace(1)* undef
+  store volatile i64 %val.1, i64 addrspace(1)* undef
+  ret void
+}
+
 define amdgpu_kernel void @test_call_external_a2i32_func_void() #0 {
   ; GCN-LABEL: name: test_call_external_a2i32_func_void
   ; GCN: bb.1 (%ir-block.0):

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index f06986660a73..3cd6235cf5d6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -87,6 +87,13 @@ declare hidden void @external_void_func_12xv3f32(<3 x float>, <3 x float>, <3 x
 declare hidden void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>,
     <5 x float>, <5 x float>, <5 x float>, <5 x float>) #0
 
+; amdgpu_gfx calling convention
+declare hidden amdgpu_gfx void @external_gfx_void_func_void() #0
+declare hidden amdgpu_gfx void @external_gfx_void_func_i32(i32) #0
+declare hidden amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg) #0
+declare hidden amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 }) #0
+declare hidden amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg) #0
+
 define amdgpu_kernel void @test_call_external_void_func_void() #0 {
   ; CHECK-LABEL: name: test_call_external_void_func_void
   ; CHECK: bb.1 (%ir-block.0):
@@ -138,6 +145,47 @@ define amdgpu_kernel void @test_call_external_void_func_void() #0 {
   ret void
 }
 
+define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 {
+  ; CHECK-LABEL: name: test_gfx_call_external_void_func_void
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; CHECK:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void
+  ; CHECK:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
+  ; CHECK:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; CHECK:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; CHECK:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; CHECK:   $sgpr12 = COPY [[COPY13]](s32)
+  ; CHECK:   $sgpr13 = COPY [[COPY14]](s32)
+  ; CHECK:   $sgpr14 = COPY [[COPY15]](s32)
+  ; CHECK:   $vgpr31 = COPY [[COPY16]](s32)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; CHECK:   [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK:   S_SETPC_B64_return [[COPY18]]
+  call amdgpu_gfx void @external_gfx_void_func_void()
+  ret void
+}
+
 define void @test_func_call_external_void_func_void() #0 {
   ; CHECK-LABEL: name: test_func_call_external_void_func_void
   ; CHECK: bb.1 (%ir-block.0):
@@ -844,6 +892,94 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
   ret void
 }
 
+define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 {
+  ; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; CHECK:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32
+  ; CHECK:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK:   $vgpr0 = COPY [[C]](s32)
+  ; CHECK:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+  ; CHECK:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; CHECK:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; CHECK:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
+  ; CHECK:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
+  ; CHECK:   $sgpr12 = COPY [[COPY14]](s32)
+  ; CHECK:   $sgpr13 = COPY [[COPY15]](s32)
+  ; CHECK:   $sgpr14 = COPY [[COPY16]](s32)
+  ; CHECK:   $vgpr31 = COPY [[COPY17]](s32)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; CHECK:   [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+  ; CHECK:   S_SETPC_B64_return [[COPY19]]
+  call amdgpu_gfx void @external_gfx_void_func_i32(i32 42)
+  ret void
+}
+
+define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg) #0 {
+  ; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm_inreg
+  ; CHECK: bb.1 (%ir-block.1):
+  ; CHECK:   liveins: $sgpr4, $sgpr5, $sgpr14, $sgpr15, $vgpr31, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr12_sgpr13
+  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; CHECK:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; CHECK:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr4
+  ; CHECK:   [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
+  ; CHECK:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK:   $sgpr15 = COPY [[C]](s32)
+  ; CHECK:   [[COPY18:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]](<4 x s32>)
+  ; CHECK:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
+  ; CHECK:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
+  ; CHECK:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
+  ; CHECK:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
+  ; CHECK:   $sgpr12 = COPY [[COPY14]](s32)
+  ; CHECK:   $sgpr13 = COPY [[COPY15]](s32)
+  ; CHECK:   $sgpr14 = COPY [[COPY16]](s32)
+  ; CHECK:   $vgpr31 = COPY [[COPY17]](s32)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_highregs, implicit $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; CHECK:   [[COPY19:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]]
+  ; CHECK:   S_SETPC_B64_return [[COPY19]]
+  call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
+  ret void
+}
+
 define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
   ; CHECK-LABEL: name: test_call_external_void_func_i64_imm
   ; CHECK: bb.1 (%ir-block.0):
@@ -3744,6 +3880,110 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
   ret void
 }
 
+define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
+  ; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; CHECK:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load 8 from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load 1 from %ir.ptr0, align 4, addrspace 1)
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64)
+  ; CHECK:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8)
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32
+  ; CHECK:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK:   $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK:   $vgpr1 = COPY [[LOAD2]](s32)
+  ; CHECK:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
+  ; CHECK:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; CHECK:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; CHECK:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; CHECK:   $sgpr12 = COPY [[COPY13]](s32)
+  ; CHECK:   $sgpr13 = COPY [[COPY14]](s32)
+  ; CHECK:   $sgpr14 = COPY [[COPY15]](s32)
+  ; CHECK:   $vgpr31 = COPY [[COPY16]](s32)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; CHECK:   [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK:   S_SETPC_B64_return [[COPY18]]
+  %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
+  %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
+  call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 } %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #0 {
+  ; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32_inreg
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; CHECK:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF
+  ; CHECK:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load 8 from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4)
+  ; CHECK:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load 1 from %ir.ptr0, align 4, addrspace 1)
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; CHECK:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64)
+  ; CHECK:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load 4 from %ir.ptr0 + 4, addrspace 1)
+  ; CHECK:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8)
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
+  ; CHECK:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK:   [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK:   $sgpr15 = COPY [[ANYEXT]](s32)
+  ; CHECK:   $sgpr16 = COPY [[LOAD2]](s32)
+  ; CHECK:   [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
+  ; CHECK:   $sgpr4_sgpr5 = COPY [[COPY9]](p4)
+  ; CHECK:   $sgpr6_sgpr7 = COPY [[COPY10]](p4)
+  ; CHECK:   $sgpr8_sgpr9 = COPY [[COPY11]](p4)
+  ; CHECK:   $sgpr10_sgpr11 = COPY [[COPY12]](s64)
+  ; CHECK:   $sgpr12 = COPY [[COPY13]](s32)
+  ; CHECK:   $sgpr13 = COPY [[COPY14]](s32)
+  ; CHECK:   $sgpr14 = COPY [[COPY15]](s32)
+  ; CHECK:   $vgpr31 = COPY [[COPY16]](s32)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_highregs, implicit $sgpr15, implicit $sgpr16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; CHECK:   [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
+  ; CHECK:   S_SETPC_B64_return [[COPY18]]
+  %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
+  %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
+  call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg %val)
+  ret void
+}
+
 define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 {
   ; CHECK-LABEL: name: test_call_external_void_func_byval_struct_i8_i32
   ; CHECK: bb.1 (%ir-block.0):

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
index 96eb596dabc5..ab4562501fe8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
@@ -52,3 +52,46 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(void()* %fptr) {
   call void %fptr()
   ret void
 }
+
+define amdgpu_gfx void @test_gfx_indirect_call_sgpr_ptr(void()* %fptr) {
+  ; CHECK-LABEL: name: test_gfx_indirect_call_sgpr_ptr
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
+  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
+  ; CHECK:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[MV:%[0-9]+]]:sreg_64(p0) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
+  ; CHECK:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
+  ; CHECK:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
+  ; CHECK:   [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY5]]
+  ; CHECK:   [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY4]]
+  ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
+  ; CHECK:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
+  ; CHECK:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
+  ; CHECK:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+  ; CHECK:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
+  ; CHECK:   $sgpr4_sgpr5 = COPY [[COPY11]](p4)
+  ; CHECK:   $sgpr6_sgpr7 = COPY [[COPY12]](p4)
+  ; CHECK:   $sgpr8_sgpr9 = COPY [[COPY13]](p4)
+  ; CHECK:   $sgpr10_sgpr11 = COPY [[COPY14]](s64)
+  ; CHECK:   $sgpr12 = COPY [[COPY15]](s32)
+  ; CHECK:   $sgpr13 = COPY [[COPY16]](s32)
+  ; CHECK:   $sgpr14 = COPY [[COPY17]](s32)
+  ; CHECK:   $vgpr31 = COPY [[COPY18]](s32)
+  ; CHECK:   $sgpr30_sgpr31 = SI_CALL [[MV]](p0), 0, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+  ; CHECK:   [[COPY20:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
+  ; CHECK:   S_SETPC_B64_return [[COPY20]]
+  call amdgpu_gfx void %fptr()
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
new file mode 100644
index 000000000000..f50a07d9afef
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
+
+; GCN-LABEL: {{^}}gfx_callable_amdpal:
+; GCN:         .amdgpu_pal_metadata
+; GCN-NEXT: ---
+; GCN-NEXT: amdpal.pipelines:
+; GCN-NEXT:   - .registers:      {}
+; GCN-NEXT: ...
+; GCN-NEXT:         .end_amdgpu_pal_metadata
+define amdgpu_gfx half @gfx_callable_amdpal(half %arg0) {
+  %add = fadd half %arg0, 1.0
+  ret half %add
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
new file mode 100644
index 000000000000..3991b2722f9b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -0,0 +1,6562 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+
+declare hidden amdgpu_gfx void @external_void_func_i1(i1) #0
+declare hidden amdgpu_gfx void @external_void_func_i1_signext(i1 signext) #0
+declare hidden amdgpu_gfx void @external_void_func_i1_zeroext(i1 zeroext) #0
+
+declare hidden amdgpu_gfx void @external_void_func_i8(i8) #0
+declare hidden amdgpu_gfx void @external_void_func_i8_signext(i8 signext) #0
+declare hidden amdgpu_gfx void @external_void_func_i8_zeroext(i8 zeroext) #0
+
+declare hidden amdgpu_gfx void @external_void_func_i16(i16) #0
+declare hidden amdgpu_gfx void @external_void_func_i16_signext(i16 signext) #0
+declare hidden amdgpu_gfx void @external_void_func_i16_zeroext(i16 zeroext) #0
+
+declare hidden amdgpu_gfx void @external_void_func_i32(i32) #0
+declare hidden amdgpu_gfx void @external_void_func_i64(i64) #0
+declare hidden amdgpu_gfx void @external_void_func_v2i64(<2 x i64>) #0
+declare hidden amdgpu_gfx void @external_void_func_v3i64(<3 x i64>) #0
+declare hidden amdgpu_gfx void @external_void_func_v4i64(<4 x i64>) #0
+
+declare hidden amdgpu_gfx void @external_void_func_f16(half) #0
+declare hidden amdgpu_gfx void @external_void_func_f32(float) #0
+declare hidden amdgpu_gfx void @external_void_func_f64(double) #0
+declare hidden amdgpu_gfx void @external_void_func_v2f32(<2 x float>) #0
+declare hidden amdgpu_gfx void @external_void_func_v2f64(<2 x double>) #0
+declare hidden amdgpu_gfx void @external_void_func_v3f32(<3 x float>) #0
+declare hidden amdgpu_gfx void @external_void_func_v3f64(<3 x double>) #0
+declare hidden amdgpu_gfx void @external_void_func_v5f32(<5 x float>) #0
+
+declare hidden amdgpu_gfx void @external_void_func_v2i16(<2 x i16>) #0
+declare hidden amdgpu_gfx void @external_void_func_v2f16(<2 x half>) #0
+declare hidden amdgpu_gfx void @external_void_func_v3i16(<3 x i16>) #0
+declare hidden amdgpu_gfx void @external_void_func_v3f16(<3 x half>) #0
+declare hidden amdgpu_gfx void @external_void_func_v4i16(<4 x i16>) #0
+declare hidden amdgpu_gfx void @external_void_func_v4f16(<4 x half>) #0
+
+declare hidden amdgpu_gfx void @external_void_func_v2i32(<2 x i32>) #0
+declare hidden amdgpu_gfx void @external_void_func_v3i32(<3 x i32>) #0
+declare hidden amdgpu_gfx void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
+declare hidden amdgpu_gfx void @external_void_func_v4i32(<4 x i32>) #0
+declare hidden amdgpu_gfx void @external_void_func_v5i32(<5 x i32>) #0
+declare hidden amdgpu_gfx void @external_void_func_v8i32(<8 x i32>) #0
+declare hidden amdgpu_gfx void @external_void_func_v16i32(<16 x i32>) #0
+declare hidden amdgpu_gfx void @external_void_func_v32i32(<32 x i32>) #0
+declare hidden amdgpu_gfx void @external_void_func_v32i32_i32(<32 x i32>, i32) #0
+
+declare hidden amdgpu_gfx void @external_void_func_i1_inreg(i1 inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_i8_inreg(i8 inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_i16_inreg(i16 inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_i32_inreg(i32 inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_i64_inreg(i64 inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v3i64_inreg(<3 x i64> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> inreg) #0
+
+declare hidden amdgpu_gfx void @external_void_func_f16_inreg(half inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_f32_inreg(float inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_f64_inreg(double inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> inreg) #0
+
+declare hidden amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v4f16_inreg(<4 x half> inreg) #0
+
+declare hidden amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> inreg, i32 inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v5i32_inreg(<5 x i32> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v32i32_i32_inreg(<32 x i32> inreg, i32 inreg) #0
+
+; return value and argument
+declare hidden amdgpu_gfx i32 @external_i32_func_i32(i32) #0
+
+; Structs
+declare hidden amdgpu_gfx void @external_void_func_struct_i8_i32({ i8, i32 }) #0
+declare hidden amdgpu_gfx void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval) #0
+declare hidden amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret, { i8, i32 } addrspace(5)* byval) #0
+
+declare hidden amdgpu_gfx void @external_void_func_v16i8(<16 x i8>) #0
+
+define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i1_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_i1(i1 true)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i1_signext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %var = load volatile i1, i1 addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_i1_signext(i1 %var)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i1_zeroext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %var = load volatile i1, i1 addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_i1_zeroext(i1 %var)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i8_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i8_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i8 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i8 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_i8(i8 123)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i8_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_sbyte v0, v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_signext at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i8_signext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_sbyte v0, v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_signext at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %var = load volatile i8, i8 addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_i8_signext(i8 %var)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i8_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_zeroext at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i8_zeroext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_zeroext at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %var = load volatile i8, i8 addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_i8_zeroext(i8 %var)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_i16_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i16_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i16 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_i16(i16 123)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i16_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_signext at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i16_signext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_signext at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %var = load volatile i16, i16 addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_i16_signext(i16 %var)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i16_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_zeroext at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i16_zeroext:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_zeroext at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %var = load volatile i16, i16 addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_i16_zeroext(i16 %var)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i32_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_i32(i32 42)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_i64_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i64 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i64_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i64 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_i64(i64 123)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <2 x i64>, <2 x i64> addrspace(1)* null
+  call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i64_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 4
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2i64_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 2
+; GFX10-NEXT:    v_mov_b32_e32 v2, 3
+; GFX10-NEXT:    v_mov_b32_e32 v3, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v4, 1
+; GFX9-NEXT:    v_mov_b32_e32 v5, 2
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i64 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v4, 1
+; GFX10-NEXT:    v_mov_b32_e32 v5, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3i64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i64 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %load = load <2 x i64>, <2 x i64> addrspace(1)* null
+  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
+
+  call amdgpu_gfx void @external_void_func_v3i64(<3 x i64> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v4, 1
+; GFX9-NEXT:    v_mov_b32_e32 v5, 2
+; GFX9-NEXT:    v_mov_b32_e32 v6, 3
+; GFX9-NEXT:    v_mov_b32_e32 v7, 4
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i64 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v4i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v4, 1
+; GFX10-NEXT:    v_mov_b32_e32 v5, 2
+; GFX10-NEXT:    v_mov_b32_e32 v6, 3
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_mov_b32_e32 v7, 4
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v4i64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i64 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %load = load <2 x i64>, <2 x i64> addrspace(1)* null
+  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call amdgpu_gfx void @external_void_func_v4i64(<4 x i64> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_f16_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f16 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_f16_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0x4400
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_f16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_f16 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_f16(half 4.0)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_f32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4.0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_f32_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 4.0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_f32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_f32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_f32(float 4.0)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2f32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2f32_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3f32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3f32_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 4.0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v5f32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v3, -1.0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0.5
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v5f32_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 4.0
+; GFX10-NEXT:    v_mov_b32_e32 v3, -1.0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0.5
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_f64_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_f64_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_f64(double 4.0)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2f64_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2f64_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3f64_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x40200000
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3f64_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x40200000
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <2 x i16>, <2 x i16> addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_v2i16(<2 x i16> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <3 x i16>, <3 x i16> addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <3 x half>, <3 x half> addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_v3f16(<3 x half> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i16_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX9-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3i16_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX10-NEXT:    v_mov_b32_e32 v1, 3
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3f16_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x40003c00
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3f16_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0x40003c00
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4400
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v4i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <4 x i16>, <4 x i16> addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i16_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40003
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v4i16_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x40003
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <2 x half>, <2 x half> addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_v2f16(<2 x half> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2i32_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 5
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3i32_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 3
+; GFX10-NEXT:    v_mov_b32_e32 v1, 4
+; GFX10-NEXT:    v_mov_b32_e32 v2, 5
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i32_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 5
+; GFX9-NEXT:    v_mov_b32_e32 v3, 6
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3i32_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 3
+; GFX10-NEXT:    v_mov_b32_e32 v1, 4
+; GFX10-NEXT:    v_mov_b32_e32 v2, 5
+; GFX10-NEXT:    v_mov_b32_e32 v3, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v4i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 4
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v4i32_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 2
+; GFX10-NEXT:    v_mov_b32_e32 v2, 3
+; GFX10-NEXT:    v_mov_b32_e32 v3, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v5i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 4
+; GFX9-NEXT:    v_mov_b32_e32 v4, 5
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v5i32_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 2
+; GFX10-NEXT:    v_mov_b32_e32 v2, 3
+; GFX10-NEXT:    v_mov_b32_e32 v3, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 5
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
+; GFX9-LABEL: test_call_external_void_func_v8i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[4:5], off
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[4:5], off offset:16
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v8i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[4:5], off
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[4:5], off offset:16
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
+  %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
+  call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
+; GFX9-LABEL: test_call_external_void_func_v8i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 4
+; GFX9-NEXT:    v_mov_b32_e32 v4, 5
+; GFX9-NEXT:    v_mov_b32_e32 v5, 6
+; GFX9-NEXT:    v_mov_b32_e32 v6, 7
+; GFX9-NEXT:    v_mov_b32_e32 v7, 8
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v8i32_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 2
+; GFX10-NEXT:    v_mov_b32_e32 v2, 3
+; GFX10-NEXT:    v_mov_b32_e32 v3, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 5
+; GFX10-NEXT:    v_mov_b32_e32 v5, 6
+; GFX10-NEXT:    v_mov_b32_e32 v6, 7
+; GFX10-NEXT:    v_mov_b32_e32 v7, 8
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
+; GFX9-LABEL: test_call_external_void_func_v16i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v13, s5
+; GFX9-NEXT:    v_mov_b32_e32 v12, s4
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[12:13], off
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:48
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v16i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v13, s5
+; GFX10-NEXT:    v_mov_b32_e32 v12, s4
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32 at rel32@hi+12
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[12:13], off
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:48
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
+  %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
+  call amdgpu_gfx void @external_void_func_v16i32(<16 x i32> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
+; GFX9-LABEL: test_call_external_void_func_v32i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v29, s5
+; GFX9-NEXT:    v_mov_b32_e32 v28, s4
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[28:29], off
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[28:29], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[28:29], off offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[28:29], off offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[28:29], off offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[28:29], off offset:80
+; GFX9-NEXT:    global_load_dwordx4 v[24:27], v[28:29], off offset:96
+; GFX9-NEXT:    global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v32i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v29, s5
+; GFX10-NEXT:    v_mov_b32_e32 v28, s4
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32 at rel32@hi+12
+; GFX10-NEXT:    s_clause 0x7
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[28:29], off
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[28:29], off offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[28:29], off offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[28:29], off offset:48
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[28:29], off offset:64
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v[28:29], off offset:80
+; GFX10-NEXT:    global_load_dwordx4 v[24:27], v[28:29], off offset:96
+; GFX10-NEXT:    global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
+  %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
+  call amdgpu_gfx void @external_void_func_v32i32(<32 x i32> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_v32i32_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v29, s5
+; GFX9-NEXT:    v_mov_b32_e32 v28, s4
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[28:29], off
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[28:29], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[28:29], off offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[28:29], off offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[28:29], off offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[28:29], off offset:80
+; GFX9-NEXT:    global_load_dwordx4 v[24:27], v[28:29], off offset:96
+; GFX9-NEXT:    global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    global_load_dword v32, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v32i32_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v29, s5
+; GFX10-NEXT:    v_mov_b32_e32 v28, s4
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32 at rel32@hi+12
+; GFX10-NEXT:    global_load_dword v32, v[0:1], off
+; GFX10-NEXT:    s_clause 0x7
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[28:29], off
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[28:29], off offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[28:29], off offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[28:29], off offset:48
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[28:29], off offset:64
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v[28:29], off offset:80
+; GFX10-NEXT:    global_load_dwordx4 v[24:27], v[28:29], off offset:96
+; GFX10-NEXT:    global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX10-NEXT:    s_waitcnt vmcnt(8)
+; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
+  %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0
+  %val1 = load i32, i32 addrspace(1)* undef
+  call amdgpu_gfx void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
+; GFX9-LABEL: test_call_external_i32_func_i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v42, s33, 2
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v40, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX9-NEXT:    v_mov_b32_e32 v41, v1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    global_store_dword v[40:41], v0, off
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s4, v42, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v42, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v42, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_i32_func_i32_imm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v42, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_mov_b32_e32 v40, v0
+; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
+; GFX10-NEXT:    v_mov_b32_e32 v41, v1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    global_store_dword v[40:41], v0, off
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    v_readlane_b32 s4, v42, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v42, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v42, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = call amdgpu_gfx i32 @external_i32_func_i32(i32 42)
+  store volatile i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
+; GFX9-LABEL: test_call_external_void_func_struct_i8_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    global_load_dword v1, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_ubyte v0, v[2:3], off
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_struct_i8_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32 at rel32@hi+12
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_ubyte v0, v[1:2], off
+; GFX10-NEXT:    global_load_dword v1, v[1:2], off offset:4
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
+  %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
+  call amdgpu_gfx void @external_void_func_struct_i8_i32({ i8, i32 } %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
+; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s33
+; GFX9-NEXT:    v_mov_b32_e32 v0, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 3
+; GFX10-NEXT:    v_mov_b32_e32 v1, 8
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s33
+; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = alloca { i8, i32 }, align 4, addrspace(5)
+  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 0
+  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 1
+  store i8 3, i8 addrspace(5)* %gep0
+  store i32 8, i32 addrspace(5)* %gep1
+  call amdgpu_gfx void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s33
+; GFX9-NEXT:    v_mov_b32_e32 v0, 8
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x800
+; GFX9-NEXT:    v_add_u32_e32 v0, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x800
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_byte v[0:1], v0, off
+; GFX9-NEXT:    global_store_dword v[0:1], v1, off
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, 3
+; GFX10-NEXT:    v_mov_b32_e32 v1, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s33
+; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_lshrrev_b32_e64 v1, 5, s33
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 8, v0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_byte v[0:1], v0, off
+; GFX10-NEXT:    global_store_dword v[0:1], v1, off
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %in.val = alloca { i8, i32 }, align 4, addrspace(5)
+  %out.val = alloca { i8, i32 }, align 4, addrspace(5)
+  %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 0
+  %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 1
+  store i8 3, i8 addrspace(5)* %in.gep0
+  store i32 8, i32 addrspace(5)* %in.gep1
+  call amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %out.val, { i8, i32 } addrspace(5)* %in.val)
+  %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 0
+  %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 1
+  %out.val0 = load i8, i8 addrspace(5)* %out.gep0
+  %out.val1 = load i32, i32 addrspace(5)* %out.gep1
+
+  store volatile i8 %out.val0, i8 addrspace(1)* undef
+  store volatile i32 %out.val1, i32 addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
+; GFX9-LABEL: test_call_external_void_func_v16i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8 at rel32@hi+12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX9-NEXT:    v_mov_b32_e32 v8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX9-NEXT:    v_mov_b32_e32 v12, v3
+; GFX9-NEXT:    v_mov_b32_e32 v1, v16
+; GFX9-NEXT:    v_mov_b32_e32 v2, v17
+; GFX9-NEXT:    v_mov_b32_e32 v3, v18
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v16i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8 at rel32@hi+12
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX10-NEXT:    v_mov_b32_e32 v8, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX10-NEXT:    v_mov_b32_e32 v12, v3
+; GFX10-NEXT:    v_mov_b32_e32 v1, v16
+; GFX10-NEXT:    v_mov_b32_e32 v2, v17
+; GFX10-NEXT:    v_mov_b32_e32 v3, v18
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef
+  %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  call amdgpu_gfx void @external_void_func_v16i8(<16 x i8> %val)
+  ret void
+}
+
+define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
+; GFX9-LABEL: tail_call_byval_align16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:8
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:12
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x800
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x800
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: tail_call_byval_align16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:12
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+entry:
+  %alloca = alloca double, align 8, addrspace(5)
+  tail call amdgpu_gfx void @byval_align16_f64_arg(<32 x i32> %val, double addrspace(5)* byval align 16 %alloca)
+  ret void
+}
+
+; inreg arguments are put in sgprs
+define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_i1_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i1_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_i1_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_i1_inreg(i1 true)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i8_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_movk_i32 s4, 0x7b
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i8_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i8_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i8_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_i8_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_i8_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_i8_inreg(i8 123)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_i16_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_movk_i32 s4, 0x7b
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i16_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i16_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_i16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_i16_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_i16_inreg(i16 123)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_i32_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 42
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i32_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 42
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_i32_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_i32_inreg(i32 42)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_i64_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_movk_i32 s4, 0x7b
+; GFX9-NEXT:    s_mov_b32 s5, 0
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_i64_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_i64_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_movk_i32 s4, 0x7b
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_i64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_i64_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_i64_inreg(i64 123)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2i64_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b64 s[4:5], 0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[8:9]
+; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <2 x i64>, <2 x i64> addrspace(4)* null
+  call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i64_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 1
+; GFX9-NEXT:    s_mov_b32 s5, 2
+; GFX9-NEXT:    s_mov_b32 s6, 3
+; GFX9-NEXT:    s_mov_b32 s7, 4
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2i64_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    s_mov_b32 s6, 3
+; GFX10-NEXT:    s_mov_b32 s7, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[8:9]
+; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> <i64 8589934593, i64 17179869187>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s8, 1
+; GFX9-NEXT:    s_mov_b32 s9, 2
+; GFX9-NEXT:    s_getpc_b64 s[10:11]
+; GFX9-NEXT:    s_add_u32 s10, s10, external_void_func_v3i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s11, s11, external_void_func_v3i64_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3i64_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b64 s[4:5], 0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    s_mov_b32 s8, 1
+; GFX10-NEXT:    s_mov_b32 s9, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[10:11]
+; GFX10-NEXT:    s_add_u32 s10, s10, external_void_func_v3i64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s11, s11, external_void_func_v3i64_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %load = load <2 x i64>, <2 x i64> addrspace(4)* null
+  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
+
+  call amdgpu_gfx void @external_void_func_v3i64_inreg(<3 x i64> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i64_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s8, 1
+; GFX9-NEXT:    s_mov_b32 s9, 2
+; GFX9-NEXT:    s_mov_b32 s10, 3
+; GFX9-NEXT:    s_mov_b32 s11, 4
+; GFX9-NEXT:    s_getpc_b64 s[12:13]
+; GFX9-NEXT:    s_add_u32 s12, s12, external_void_func_v4i64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s13, s13, external_void_func_v4i64_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v4i64_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_mov_b64 s[4:5], 0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    s_mov_b32 s8, 1
+; GFX10-NEXT:    s_mov_b32 s9, 2
+; GFX10-NEXT:    s_mov_b32 s10, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s11, 4
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[12:13]
+; GFX10-NEXT:    s_add_u32 s12, s12, external_void_func_v4i64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s13, s13, external_void_func_v4i64_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %load = load <2 x i64>, <2 x i64> addrspace(4)* null
+  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_f16_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_movk_i32 s4, 0x4400
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_f16_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_f16_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_movk_i32 s4, 0x4400
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_f16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_f16_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_f16_inreg(half 4.0)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_f32_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 4.0
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_f32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_f32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_f32_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 4.0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_f32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_f32_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_f32_inreg(float 4.0)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2f32_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 1.0
+; GFX9-NEXT:    s_mov_b32 s5, 2.0
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2f32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2f32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2f32_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 1.0
+; GFX10-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v2f32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v2f32_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> <float 1.0, float 2.0>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3f32_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 1.0
+; GFX9-NEXT:    s_mov_b32 s5, 2.0
+; GFX9-NEXT:    s_mov_b32 s6, 4.0
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v3f32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3f32_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 1.0
+; GFX10-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-NEXT:    s_mov_b32 s6, 4.0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[8:9]
+; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v3f32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f32_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> <float 1.0, float 2.0, float 4.0>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v5f32_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 1.0
+; GFX9-NEXT:    s_mov_b32 s5, 2.0
+; GFX9-NEXT:    s_mov_b32 s6, 4.0
+; GFX9-NEXT:    s_mov_b32 s7, -1.0
+; GFX9-NEXT:    s_mov_b32 s8, 0.5
+; GFX9-NEXT:    s_getpc_b64 s[10:11]
+; GFX9-NEXT:    s_add_u32 s10, s10, external_void_func_v5f32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s11, s11, external_void_func_v5f32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v5f32_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 1.0
+; GFX10-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-NEXT:    s_mov_b32 s6, 4.0
+; GFX10-NEXT:    s_mov_b32 s7, -1.0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s8, 0.5
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[10:11]
+; GFX10-NEXT:    s_add_u32 s10, s10, external_void_func_v5f32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s11, s11, external_void_func_v5f32_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_f64_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0x40100000
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_f64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_f64_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_f64_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0x40100000
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_f64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_f64_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_f64_inreg(double 4.0)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2f64_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 2.0
+; GFX9-NEXT:    s_mov_b32 s6, 0
+; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v2f64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f64_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2f64_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-NEXT:    s_mov_b32 s6, 0
+; GFX10-NEXT:    s_mov_b32 s7, 0x40100000
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[8:9]
+; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v2f64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f64_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> <double 2.0, double 4.0>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3f64_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 2.0
+; GFX9-NEXT:    s_mov_b32 s6, 0
+; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
+; GFX9-NEXT:    s_mov_b32 s8, 0
+; GFX9-NEXT:    s_mov_b32 s9, 0x40200000
+; GFX9-NEXT:    s_getpc_b64 s[10:11]
+; GFX9-NEXT:    s_add_u32 s10, s10, external_void_func_v3f64_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s11, s11, external_void_func_v3f64_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3f64_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 2.0
+; GFX10-NEXT:    s_mov_b32 s6, 0
+; GFX10-NEXT:    s_mov_b32 s7, 0x40100000
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s8, 0
+; GFX10-NEXT:    s_mov_b32 s9, 0x40200000
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[10:11]
+; GFX10-NEXT:    s_add_u32 s10, s10, external_void_func_v3f64_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s11, s11, external_void_func_v3f64_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> <double 2.0, double 4.0, double 8.0>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i16_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2i16_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v2i16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i16_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <2 x i16>, <2 x i16> addrspace(4)* undef
+  call amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v3i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v3i16_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3i16_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v3i16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v3i16_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <3 x i16>, <3 x i16> addrspace(4)* undef
+  call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v3f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v3f16_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3f16_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v3f16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v3f16_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <3 x half>, <3 x half> addrspace(4)* undef
+  call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i16_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 0x20001
+; GFX9-NEXT:    s_mov_b32 s5, 3
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v3i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v3i16_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3i16_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 0x20001
+; GFX10-NEXT:    s_mov_b32 s5, 3
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v3i16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v3i16_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> <i16 1, i16 2, i16 3>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v3f16_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 0x40003c00
+; GFX9-NEXT:    s_movk_i32 s5, 0x4400
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v3f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v3f16_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3f16_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 0x40003c00
+; GFX10-NEXT:    s_movk_i32 s5, 0x4400
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v3f16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v3f16_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> <half 1.0, half 2.0, half 4.0>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v4i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v4i16_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v4i16_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v4i16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v4i16_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <4 x i16>, <4 x i16> addrspace(4)* undef
+  call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i16_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 0x20001
+; GFX9-NEXT:    s_mov_b32 s5, 0x40003
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v4i16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v4i16_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v4i16_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 0x20001
+; GFX10-NEXT:    s_mov_b32 s5, 0x40003
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v4i16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v4i16_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> <i16 1, i16 2, i16 3, i16 4>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2f16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2f16_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2f16_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2f16_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v2f16_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v2f16_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <2 x half>, <2 x half> addrspace(4)* undef
+  call amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2i32_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v2i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i32_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <2 x i32>, <2 x i32> addrspace(4)* undef
+  call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v2i32_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 1
+; GFX9-NEXT:    s_mov_b32 s5, 2
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, external_void_func_v2i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v2i32_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[6:7]
+; GFX10-NEXT:    s_add_u32 s6, s6, external_void_func_v2i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s7, s7, external_void_func_v2i32_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> <i32 1, i32 2>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i32_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 3
+; GFX9-NEXT:    s_mov_b32 s5, 4
+; GFX9-NEXT:    s_mov_b32 s6, 5
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3i32_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 3
+; GFX10-NEXT:    s_mov_b32 s5, 4
+; GFX10-NEXT:    s_mov_b32 s6, 5
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[8:9]
+; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> <i32 3, i32 4, i32 5>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_v3i32_i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 3
+; GFX9-NEXT:    s_mov_b32 s5, 4
+; GFX9-NEXT:    s_mov_b32 s6, 5
+; GFX9-NEXT:    s_mov_b32 s7, 6
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v3i32_i32_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 3
+; GFX10-NEXT:    s_mov_b32 s5, 4
+; GFX10-NEXT:    s_mov_b32 s6, 5
+; GFX10-NEXT:    s_mov_b32 s7, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[8:9]
+; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_i32_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v4i32_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[8:9]
+; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %val = load <4 x i32>, <4 x i32> addrspace(4)* undef
+  call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v4i32_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 1
+; GFX9-NEXT:    s_mov_b32 s5, 2
+; GFX9-NEXT:    s_mov_b32 s6, 3
+; GFX9-NEXT:    s_mov_b32 s7, 4
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v4i32_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    s_mov_b32 s6, 3
+; GFX10-NEXT:    s_mov_b32 s7, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[8:9]
+; GFX10-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32_inreg at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v5i32_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 1
+; GFX9-NEXT:    s_mov_b32 s5, 2
+; GFX9-NEXT:    s_mov_b32 s6, 3
+; GFX9-NEXT:    s_mov_b32 s7, 4
+; GFX9-NEXT:    s_mov_b32 s8, 5
+; GFX9-NEXT:    s_getpc_b64 s[10:11]
+; GFX9-NEXT:    s_add_u32 s10, s10, external_void_func_v5i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s11, s11, external_void_func_v5i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v5i32_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    s_mov_b32 s6, 3
+; GFX10-NEXT:    s_mov_b32 s7, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s8, 5
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[10:11]
+; GFX10-NEXT:    s_add_u32 s10, s10, external_void_func_v5i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s11, s11, external_void_func_v5i32_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v5i32_inreg(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v8i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[12:13]
+; GFX9-NEXT:    s_add_u32 s12, s12, external_void_func_v8i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s13, s13, external_void_func_v8i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v8i32_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_getpc_b64 s[12:13]
+; GFX10-NEXT:    s_add_u32 s12, s12, external_void_func_v8i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s13, s13, external_void_func_v8i32_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %ptr = load <8 x i32> addrspace(4)*, <8 x i32> addrspace(4)* addrspace(4)* undef
+  %val = load <8 x i32>, <8 x i32> addrspace(4)* %ptr
+  call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v8i32_imm_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s4, 1
+; GFX9-NEXT:    s_mov_b32 s5, 2
+; GFX9-NEXT:    s_mov_b32 s6, 3
+; GFX9-NEXT:    s_mov_b32 s7, 4
+; GFX9-NEXT:    s_mov_b32 s8, 5
+; GFX9-NEXT:    s_mov_b32 s9, 6
+; GFX9-NEXT:    s_mov_b32 s10, 7
+; GFX9-NEXT:    s_mov_b32 s11, 8
+; GFX9-NEXT:    s_getpc_b64 s[12:13]
+; GFX9-NEXT:    s_add_u32 s12, s12, external_void_func_v8i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s13, s13, external_void_func_v8i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v8i32_imm_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    s_mov_b32 s6, 3
+; GFX10-NEXT:    s_mov_b32 s7, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s8, 5
+; GFX10-NEXT:    s_mov_b32 s9, 6
+; GFX10-NEXT:    s_mov_b32 s10, 7
+; GFX10-NEXT:    s_mov_b32 s11, 8
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[12:13]
+; GFX10-NEXT:    s_add_u32 s12, s12, external_void_func_v8i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s13, s13, external_void_func_v8i32_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v16i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, external_void_func_v16i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, external_void_func_v16i32_inreg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v16i32_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_getpc_b64 s[20:21]
+; GFX10-NEXT:    s_add_u32 s20, s20, external_void_func_v16i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s21, s21, external_void_func_v16i32_inreg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[20:21]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %ptr = load <16 x i32> addrspace(4)*, <16 x i32> addrspace(4)* addrspace(4)* undef
+  %val = load <16 x i32>, <16 x i32> addrspace(4)* %ptr
+  call amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
+; GFX9-LABEL: test_call_external_void_func_v32i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 18
+; GFX9-NEXT:    v_writelane_b32 v40, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s40, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s41, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s42, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s43, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s44, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s45, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s46, 10
+; GFX9-NEXT:    s_load_dwordx2 s[20:21], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s47, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s50, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s51, 15
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
+; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[20:21], 0x40
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 16
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s46
+; GFX9-NEXT:    v_mov_b32_e32 v1, s47
+; GFX9-NEXT:    v_mov_b32_e32 v2, s48
+; GFX9-NEXT:    v_mov_b32_e32 v3, s49
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s50
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s51
+; GFX9-NEXT:    s_mov_b32 s20, s36
+; GFX9-NEXT:    s_mov_b32 s21, s37
+; GFX9-NEXT:    s_mov_b32 s22, s38
+; GFX9-NEXT:    s_mov_b32 s23, s39
+; GFX9-NEXT:    s_mov_b32 s24, s40
+; GFX9-NEXT:    s_mov_b32 s25, s41
+; GFX9-NEXT:    s_mov_b32 s26, s42
+; GFX9-NEXT:    s_mov_b32 s27, s43
+; GFX9-NEXT:    s_mov_b32 s28, s44
+; GFX9-NEXT:    s_mov_b32 s29, s45
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_inreg at rel32@hi+12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 17
+; GFX9-NEXT:    v_readlane_b32 s51, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s50, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s47, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s46, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s45, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s44, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s43, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s42, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s41, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s40, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v40, 0
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 18
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v32i32_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 18
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s36, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s37, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s38, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s39, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s40, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s41, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s42, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s43, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s44, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s45, 9
+; GFX10-NEXT:    v_writelane_b32 v40, s46, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s47, 11
+; GFX10-NEXT:    v_writelane_b32 v40, s48, 12
+; GFX10-NEXT:    v_writelane_b32 v40, s49, 13
+; GFX10-NEXT:    v_writelane_b32 v40, s50, 14
+; GFX10-NEXT:    v_writelane_b32 v40, s51, 15
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x40
+; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 16
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s46
+; GFX10-NEXT:    v_mov_b32_e32 v1, s47
+; GFX10-NEXT:    v_mov_b32_e32 v2, s48
+; GFX10-NEXT:    v_mov_b32_e32 v3, s49
+; GFX10-NEXT:    s_mov_b32 s20, s36
+; GFX10-NEXT:    s_mov_b32 s21, s37
+; GFX10-NEXT:    s_mov_b32 s22, s38
+; GFX10-NEXT:    s_mov_b32 s23, s39
+; GFX10-NEXT:    s_mov_b32 s24, s40
+; GFX10-NEXT:    s_mov_b32 s25, s41
+; GFX10-NEXT:    s_mov_b32 s26, s42
+; GFX10-NEXT:    s_mov_b32 s27, s43
+; GFX10-NEXT:    s_mov_b32 s28, s44
+; GFX10-NEXT:    s_mov_b32 s29, s45
+; GFX10-NEXT:    v_mov_b32_e32 v4, s50
+; GFX10-NEXT:    v_mov_b32_e32 v5, s51
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 16
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 17
+; GFX10-NEXT:    v_readlane_b32 s51, v40, 15
+; GFX10-NEXT:    v_readlane_b32 s50, v40, 14
+; GFX10-NEXT:    v_readlane_b32 s49, v40, 13
+; GFX10-NEXT:    v_readlane_b32 s48, v40, 12
+; GFX10-NEXT:    v_readlane_b32 s47, v40, 11
+; GFX10-NEXT:    v_readlane_b32 s46, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s45, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s44, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s43, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s42, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s41, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s40, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s39, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s38, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s37, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s36, v40, 0
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 18
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %ptr = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef
+  %val = load <32 x i32>, <32 x i32> addrspace(4)* %ptr
+  call amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
+; GFX9-LABEL: test_call_external_void_func_v32i32_i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 18
+; GFX9-NEXT:    v_writelane_b32 v40, s36, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s37, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s38, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s39, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s40, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s41, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s42, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s43, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s44, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s45, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s46, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s47, 11
+; GFX9-NEXT:    s_load_dwordx2 s[20:21], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s22, s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s50, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s51, 15
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
+; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[20:21], 0x40
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s22
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s46
+; GFX9-NEXT:    v_mov_b32_e32 v1, s47
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s48
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s49
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    v_mov_b32_e32 v0, s50
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 16
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s51
+; GFX9-NEXT:    s_mov_b32 s20, s36
+; GFX9-NEXT:    s_mov_b32 s21, s37
+; GFX9-NEXT:    s_mov_b32 s22, s38
+; GFX9-NEXT:    s_mov_b32 s23, s39
+; GFX9-NEXT:    s_mov_b32 s24, s40
+; GFX9-NEXT:    s_mov_b32 s25, s41
+; GFX9-NEXT:    s_mov_b32 s26, s42
+; GFX9-NEXT:    s_mov_b32 s27, s43
+; GFX9-NEXT:    s_mov_b32 s28, s44
+; GFX9-NEXT:    s_mov_b32 s29, s45
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX9-NEXT:    s_getpc_b64 s[30:31]
+; GFX9-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg at rel32@hi+12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 17
+; GFX9-NEXT:    v_readlane_b32 s51, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s50, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s47, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s46, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s45, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s44, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s43, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s42, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s41, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s40, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s39, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s38, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s37, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s36, v40, 0
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 18
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_v32i32_i32_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 18
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s36, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s37, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s38, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s39, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s40, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s41, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s42, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s43, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s44, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s45, 9
+; GFX10-NEXT:    v_writelane_b32 v40, s46, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s47, 11
+; GFX10-NEXT:    v_writelane_b32 v40, s48, 12
+; GFX10-NEXT:    v_writelane_b32 v40, s49, 13
+; GFX10-NEXT:    v_writelane_b32 v40, s50, 14
+; GFX10-NEXT:    v_writelane_b32 v40, s51, 15
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    s_load_dword s20, s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x40
+; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 16
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX10-NEXT:    s_getpc_b64 s[30:31]
+; GFX10-NEXT:    s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg at rel32@hi+12
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s20
+; GFX10-NEXT:    v_mov_b32_e32 v1, s47
+; GFX10-NEXT:    v_mov_b32_e32 v2, s48
+; GFX10-NEXT:    v_mov_b32_e32 v3, s49
+; GFX10-NEXT:    s_mov_b32 s20, s36
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    v_mov_b32_e32 v0, s46
+; GFX10-NEXT:    s_mov_b32 s21, s37
+; GFX10-NEXT:    s_mov_b32 s22, s38
+; GFX10-NEXT:    s_mov_b32 s23, s39
+; GFX10-NEXT:    s_mov_b32 s24, s40
+; GFX10-NEXT:    s_mov_b32 s25, s41
+; GFX10-NEXT:    s_mov_b32 s26, s42
+; GFX10-NEXT:    s_mov_b32 s27, s43
+; GFX10-NEXT:    s_mov_b32 s28, s44
+; GFX10-NEXT:    s_mov_b32 s29, s45
+; GFX10-NEXT:    v_mov_b32_e32 v4, s50
+; GFX10-NEXT:    v_mov_b32_e32 v5, s51
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[30:31]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 16
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 17
+; GFX10-NEXT:    v_readlane_b32 s51, v40, 15
+; GFX10-NEXT:    v_readlane_b32 s50, v40, 14
+; GFX10-NEXT:    v_readlane_b32 s49, v40, 13
+; GFX10-NEXT:    v_readlane_b32 s48, v40, 12
+; GFX10-NEXT:    v_readlane_b32 s47, v40, 11
+; GFX10-NEXT:    v_readlane_b32 s46, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s45, v40, 9
+; GFX10-NEXT:    v_readlane_b32 s44, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s43, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s42, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s41, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s40, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s39, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s38, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s37, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s36, v40, 0
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 18
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %ptr0 = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef
+  %val0 = load <32 x i32>, <32 x i32> addrspace(4)* %ptr0
+  %val1 = load i32, i32 addrspace(4)* undef
+  call amdgpu_gfx void @external_void_func_v32i32_i32_inreg(<32 x i32> %val0, i32 %val1)
+  ret void
+}
+
+define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
+; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:4
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: stack_passed_arg_alignment_v32i32_f64:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+entry:
+  call amdgpu_gfx void @stack_passed_f64_arg(<32 x i32> %val, double %tmp)
+  ret void
+}
+
+define amdgpu_gfx void @stack_12xv3i32() #0 {
+; GFX9-LABEL: stack_12xv3i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT:    v_mov_b32_e32 v0, 13
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v0, 14
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 1
+; GFX9-NEXT:    v_mov_b32_e32 v5, 1
+; GFX9-NEXT:    v_mov_b32_e32 v6, 2
+; GFX9-NEXT:    v_mov_b32_e32 v7, 2
+; GFX9-NEXT:    v_mov_b32_e32 v8, 2
+; GFX9-NEXT:    v_mov_b32_e32 v9, 3
+; GFX9-NEXT:    v_mov_b32_e32 v10, 3
+; GFX9-NEXT:    v_mov_b32_e32 v11, 3
+; GFX9-NEXT:    v_mov_b32_e32 v12, 4
+; GFX9-NEXT:    v_mov_b32_e32 v13, 4
+; GFX9-NEXT:    v_mov_b32_e32 v14, 4
+; GFX9-NEXT:    v_mov_b32_e32 v15, 5
+; GFX9-NEXT:    v_mov_b32_e32 v16, 5
+; GFX9-NEXT:    v_mov_b32_e32 v17, 5
+; GFX9-NEXT:    v_mov_b32_e32 v18, 6
+; GFX9-NEXT:    v_mov_b32_e32 v19, 6
+; GFX9-NEXT:    v_mov_b32_e32 v20, 6
+; GFX9-NEXT:    v_mov_b32_e32 v21, 7
+; GFX9-NEXT:    v_mov_b32_e32 v22, 7
+; GFX9-NEXT:    v_mov_b32_e32 v23, 7
+; GFX9-NEXT:    v_mov_b32_e32 v24, 8
+; GFX9-NEXT:    v_mov_b32_e32 v25, 8
+; GFX9-NEXT:    v_mov_b32_e32 v26, 8
+; GFX9-NEXT:    v_mov_b32_e32 v27, 9
+; GFX9-NEXT:    v_mov_b32_e32 v28, 9
+; GFX9-NEXT:    v_mov_b32_e32 v29, 9
+; GFX9-NEXT:    v_mov_b32_e32 v30, 10
+; GFX9-NEXT:    v_mov_b32_e32 v31, 11
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: stack_12xv3i32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, 12
+; GFX10-NEXT:    v_mov_b32_e32 v1, 13
+; GFX10-NEXT:    v_mov_b32_e32 v2, 14
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 1
+; GFX10-NEXT:    v_mov_b32_e32 v4, 1
+; GFX10-NEXT:    v_mov_b32_e32 v5, 1
+; GFX10-NEXT:    v_mov_b32_e32 v6, 2
+; GFX10-NEXT:    v_mov_b32_e32 v7, 2
+; GFX10-NEXT:    v_mov_b32_e32 v8, 2
+; GFX10-NEXT:    v_mov_b32_e32 v9, 3
+; GFX10-NEXT:    v_mov_b32_e32 v10, 3
+; GFX10-NEXT:    v_mov_b32_e32 v11, 3
+; GFX10-NEXT:    v_mov_b32_e32 v12, 4
+; GFX10-NEXT:    v_mov_b32_e32 v13, 4
+; GFX10-NEXT:    v_mov_b32_e32 v14, 4
+; GFX10-NEXT:    v_mov_b32_e32 v15, 5
+; GFX10-NEXT:    v_mov_b32_e32 v16, 5
+; GFX10-NEXT:    v_mov_b32_e32 v17, 5
+; GFX10-NEXT:    v_mov_b32_e32 v18, 6
+; GFX10-NEXT:    v_mov_b32_e32 v19, 6
+; GFX10-NEXT:    v_mov_b32_e32 v20, 6
+; GFX10-NEXT:    v_mov_b32_e32 v21, 7
+; GFX10-NEXT:    v_mov_b32_e32 v22, 7
+; GFX10-NEXT:    v_mov_b32_e32 v23, 7
+; GFX10-NEXT:    v_mov_b32_e32 v24, 8
+; GFX10-NEXT:    v_mov_b32_e32 v25, 8
+; GFX10-NEXT:    v_mov_b32_e32 v26, 8
+; GFX10-NEXT:    v_mov_b32_e32 v27, 9
+; GFX10-NEXT:    v_mov_b32_e32 v28, 9
+; GFX10-NEXT:    v_mov_b32_e32 v29, 9
+; GFX10-NEXT:    v_mov_b32_e32 v30, 10
+; GFX10-NEXT:    v_mov_b32_e32 v31, 11
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+entry:
+  call amdgpu_gfx void @external_void_func_12xv3i32(
+      <3 x i32><i32 0, i32 0, i32 0>,
+      <3 x i32><i32 1, i32 1, i32 1>,
+      <3 x i32><i32 2, i32 2, i32 2>,
+      <3 x i32><i32 3, i32 3, i32 3>,
+      <3 x i32><i32 4, i32 4, i32 4>,
+      <3 x i32><i32 5, i32 5, i32 5>,
+      <3 x i32><i32 6, i32 6, i32 6>,
+      <3 x i32><i32 7, i32 7, i32 7>,
+      <3 x i32><i32 8, i32 8, i32 8>,
+      <3 x i32><i32 9, i32 9, i32 9>,
+      <3 x i32><i32 10, i32 11, i32 12>,
+      <3 x i32><i32 13, i32 14, i32 15>)
+  ret void
+}
+
+define amdgpu_gfx void @stack_8xv5i32() #0 {
+; GFX9-LABEL: stack_8xv5i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 8
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT:    v_mov_b32_e32 v0, 9
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 11
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    v_mov_b32_e32 v0, 12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 13
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    v_mov_b32_e32 v0, 14
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 1
+; GFX9-NEXT:    v_mov_b32_e32 v6, 1
+; GFX9-NEXT:    v_mov_b32_e32 v7, 1
+; GFX9-NEXT:    v_mov_b32_e32 v8, 1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 1
+; GFX9-NEXT:    v_mov_b32_e32 v10, 2
+; GFX9-NEXT:    v_mov_b32_e32 v11, 2
+; GFX9-NEXT:    v_mov_b32_e32 v12, 2
+; GFX9-NEXT:    v_mov_b32_e32 v13, 2
+; GFX9-NEXT:    v_mov_b32_e32 v14, 2
+; GFX9-NEXT:    v_mov_b32_e32 v15, 3
+; GFX9-NEXT:    v_mov_b32_e32 v16, 3
+; GFX9-NEXT:    v_mov_b32_e32 v17, 3
+; GFX9-NEXT:    v_mov_b32_e32 v18, 3
+; GFX9-NEXT:    v_mov_b32_e32 v19, 3
+; GFX9-NEXT:    v_mov_b32_e32 v20, 4
+; GFX9-NEXT:    v_mov_b32_e32 v21, 4
+; GFX9-NEXT:    v_mov_b32_e32 v22, 4
+; GFX9-NEXT:    v_mov_b32_e32 v23, 4
+; GFX9-NEXT:    v_mov_b32_e32 v24, 4
+; GFX9-NEXT:    v_mov_b32_e32 v25, 5
+; GFX9-NEXT:    v_mov_b32_e32 v26, 5
+; GFX9-NEXT:    v_mov_b32_e32 v27, 5
+; GFX9-NEXT:    v_mov_b32_e32 v28, 5
+; GFX9-NEXT:    v_mov_b32_e32 v29, 5
+; GFX9-NEXT:    v_mov_b32_e32 v30, 6
+; GFX9-NEXT:    v_mov_b32_e32 v31, 7
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: stack_8xv5i32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, 8
+; GFX10-NEXT:    v_mov_b32_e32 v1, 9
+; GFX10-NEXT:    v_mov_b32_e32 v2, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_mov_b32_e32 v3, 14
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    v_mov_b32_e32 v0, 11
+; GFX10-NEXT:    v_mov_b32_e32 v1, 12
+; GFX10-NEXT:    v_mov_b32_e32 v2, 13
+; GFX10-NEXT:    v_mov_b32_e32 v4, 15
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 1
+; GFX10-NEXT:    v_mov_b32_e32 v6, 1
+; GFX10-NEXT:    v_mov_b32_e32 v7, 1
+; GFX10-NEXT:    v_mov_b32_e32 v8, 1
+; GFX10-NEXT:    v_mov_b32_e32 v9, 1
+; GFX10-NEXT:    v_mov_b32_e32 v10, 2
+; GFX10-NEXT:    v_mov_b32_e32 v11, 2
+; GFX10-NEXT:    v_mov_b32_e32 v12, 2
+; GFX10-NEXT:    v_mov_b32_e32 v13, 2
+; GFX10-NEXT:    v_mov_b32_e32 v14, 2
+; GFX10-NEXT:    v_mov_b32_e32 v15, 3
+; GFX10-NEXT:    v_mov_b32_e32 v16, 3
+; GFX10-NEXT:    v_mov_b32_e32 v17, 3
+; GFX10-NEXT:    v_mov_b32_e32 v18, 3
+; GFX10-NEXT:    v_mov_b32_e32 v19, 3
+; GFX10-NEXT:    v_mov_b32_e32 v20, 4
+; GFX10-NEXT:    v_mov_b32_e32 v21, 4
+; GFX10-NEXT:    v_mov_b32_e32 v22, 4
+; GFX10-NEXT:    v_mov_b32_e32 v23, 4
+; GFX10-NEXT:    v_mov_b32_e32 v24, 4
+; GFX10-NEXT:    v_mov_b32_e32 v25, 5
+; GFX10-NEXT:    v_mov_b32_e32 v26, 5
+; GFX10-NEXT:    v_mov_b32_e32 v27, 5
+; GFX10-NEXT:    v_mov_b32_e32 v28, 5
+; GFX10-NEXT:    v_mov_b32_e32 v29, 5
+; GFX10-NEXT:    v_mov_b32_e32 v30, 6
+; GFX10-NEXT:    v_mov_b32_e32 v31, 7
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+entry:
+  call amdgpu_gfx void @external_void_func_8xv5i32(
+      <5 x i32><i32 0, i32 0, i32 0, i32 0, i32 0>,
+      <5 x i32><i32 1, i32 1, i32 1, i32 1, i32 1>,
+      <5 x i32><i32 2, i32 2, i32 2, i32 2, i32 2>,
+      <5 x i32><i32 3, i32 3, i32 3, i32 3, i32 3>,
+      <5 x i32><i32 4, i32 4, i32 4, i32 4, i32 4>,
+      <5 x i32><i32 5, i32 5, i32 5, i32 5, i32 5>,
+      <5 x i32><i32 6, i32 7, i32 8, i32 9, i32 10>,
+      <5 x i32><i32 11, i32 12, i32 13, i32 14, i32 15>)
+  ret void
+}
+
+define amdgpu_gfx void @stack_8xv5f32() #0 {
+; GFX9-LABEL: stack_8xv5f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41000000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41100000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41200000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41300000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41400000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41500000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41600000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41700000
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v11, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v12, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v13, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v14, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v15, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v16, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v17, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v19, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v20, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v21, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v22, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v23, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v24, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v25, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v26, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v27, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v29, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v30, 0x40c00000
+; GFX9-NEXT:    v_mov_b32_e32 v31, 0x40e00000
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: stack_8xv5f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41000000
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41100000
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x41200000
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x41600000
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41300000
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41400000
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x41500000
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0x41700000
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX10-NEXT:    v_mov_b32_e32 v6, 1.0
+; GFX10-NEXT:    v_mov_b32_e32 v7, 1.0
+; GFX10-NEXT:    v_mov_b32_e32 v8, 1.0
+; GFX10-NEXT:    v_mov_b32_e32 v9, 1.0
+; GFX10-NEXT:    v_mov_b32_e32 v10, 2.0
+; GFX10-NEXT:    v_mov_b32_e32 v11, 2.0
+; GFX10-NEXT:    v_mov_b32_e32 v12, 2.0
+; GFX10-NEXT:    v_mov_b32_e32 v13, 2.0
+; GFX10-NEXT:    v_mov_b32_e32 v14, 2.0
+; GFX10-NEXT:    v_mov_b32_e32 v15, 0x40400000
+; GFX10-NEXT:    v_mov_b32_e32 v16, 0x40400000
+; GFX10-NEXT:    v_mov_b32_e32 v17, 0x40400000
+; GFX10-NEXT:    v_mov_b32_e32 v18, 0x40400000
+; GFX10-NEXT:    v_mov_b32_e32 v19, 0x40400000
+; GFX10-NEXT:    v_mov_b32_e32 v20, 4.0
+; GFX10-NEXT:    v_mov_b32_e32 v21, 4.0
+; GFX10-NEXT:    v_mov_b32_e32 v22, 4.0
+; GFX10-NEXT:    v_mov_b32_e32 v23, 4.0
+; GFX10-NEXT:    v_mov_b32_e32 v24, 4.0
+; GFX10-NEXT:    v_mov_b32_e32 v25, 0x40a00000
+; GFX10-NEXT:    v_mov_b32_e32 v26, 0x40a00000
+; GFX10-NEXT:    v_mov_b32_e32 v27, 0x40a00000
+; GFX10-NEXT:    v_mov_b32_e32 v28, 0x40a00000
+; GFX10-NEXT:    v_mov_b32_e32 v29, 0x40a00000
+; GFX10-NEXT:    v_mov_b32_e32 v30, 0x40c00000
+; GFX10-NEXT:    v_mov_b32_e32 v31, 0x40e00000
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32 at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+entry:
+  call amdgpu_gfx void @external_void_func_8xv5f32(
+      <5 x float><float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>,
+      <5 x float><float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>,
+      <5 x float><float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>,
+      <5 x float><float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>,
+      <5 x float><float 4.0, float 4.0, float 4.0, float 4.0, float 4.0>,
+      <5 x float><float 5.0, float 5.0, float 5.0, float 5.0, float 5.0>,
+      <5 x float><float 6.0, float 7.0, float 8.0, float 9.0, float 10.0>,
+      <5 x float><float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>)
+  ret void
+}
+
+declare hidden amdgpu_gfx void @byval_align16_f64_arg(<32 x i32>, double addrspace(5)* byval align 16) #0
+declare hidden amdgpu_gfx void @stack_passed_f64_arg(<32 x i32>, double) #0
+declare hidden amdgpu_gfx void @external_void_func_12xv3i32(<3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>,
+    <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>) #0
+declare hidden amdgpu_gfx void @external_void_func_8xv5i32(<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>,
+    <5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>) #0
+declare hidden amdgpu_gfx void @external_void_func_12xv3f32(<3 x float>, <3 x float>, <3 x float>, <3 x float>,
+    <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>) #0
+declare hidden amdgpu_gfx void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>,
+    <5 x float>, <5 x float>, <5 x float>, <5 x float>) #0
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noinline }

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
new file mode 100644
index 000000000000..bf946eb33f49
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -0,0 +1,835 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+
+declare hidden amdgpu_gfx void @external_void_func_void() #0
+
+define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
+; GFX9-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[34:35]
+; GFX9-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 4
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s35, 1
+; GFX10-NEXT:    s_getpc_b64 s[34:35]
+; GFX10-NEXT:    s_add_u32 s34, s34, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s35, s35, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s35, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 4
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @external_void_func_void()
+  call void asm sideeffect "", ""() #0
+  call amdgpu_gfx void @external_void_func_void()
+  ret void
+}
+
+define amdgpu_gfx void @void_func_void_clobber_s30_s31() #1 {
+; GFX9-LABEL: void_func_void_clobber_s30_s31:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; clobber
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: void_func_void_clobber_s30_s31:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_mov_b64 s[4:5], s[30:31]
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; clobber
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
+  ret void
+}
+
+define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 {
+; GFX9-LABEL: test_call_void_func_void_mayclobber_s31:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; def s31
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    s_mov_b32 s34, s31
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 1
+; GFX9-NEXT:    s_mov_b32 s31, s34
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s31
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_void_func_void_mayclobber_s31:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; def s31
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    s_mov_b32 s34, s31
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 1
+; GFX10-NEXT:    s_mov_b32 s31, s34
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use s31
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %s31 = call i32 asm sideeffect "; def $0", "={s31}"()
+  call amdgpu_gfx void @external_void_func_void()
+  call void asm sideeffect "; use $0", "{s31}"(i32 %s31)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
+; GFX9-LABEL: test_call_void_func_void_mayclobber_v31:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v41, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; def v31
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v41, s31, 1
+; GFX9-NEXT:    v_mov_b32_e32 v40, v31
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v31, v40
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v31
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v41, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v41, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_void_func_void_mayclobber_v31:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v41, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; def v31
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v41, s30, 0
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_mov_b32_e32 v40, v31
+; GFX10-NEXT:    v_writelane_b32 v41, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v31, v40
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v31
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    v_readlane_b32 s4, v41, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v41, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v41, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %v31 = call i32 asm sideeffect "; def $0", "={v31}"()
+  call amdgpu_gfx void @external_void_func_void()
+  call void asm sideeffect "; use $0", "{v31}"(i32 %v31)
+  ret void
+}
+
+
+define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)* %out) #0 {
+; GFX9-LABEL: test_call_void_func_void_preserves_s33:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; def s33
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s33
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 2
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_void_func_void_preserves_s33:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; def s33
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use s33
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 2
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %s33 = call i32 asm sideeffect "; def $0", "={s33}"()
+  call amdgpu_gfx void @external_void_func_void()
+  call void asm sideeffect "; use $0", "{s33}"(i32 %s33)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)* %out) #0 {
+; GFX9-LABEL: test_call_void_func_void_preserves_s34:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; def s34
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 1
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s34
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_void_func_void_preserves_s34:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; def s34
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 1
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use s34
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 0
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %s34 = call i32 asm sideeffect "; def $0", "={s34}"()
+  call amdgpu_gfx void @external_void_func_void()
+  call void asm sideeffect "; use $0", "{s34}"(i32 %s34)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)* %out) #0 {
+; GFX9-LABEL: test_call_void_func_void_preserves_v40:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v41, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v41, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v41, s31, 1
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; def v40
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v40
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v41, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v41, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_void_func_void_preserves_v40:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v41, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v41, s30, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; def v40
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v41, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v40
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    v_readlane_b32 s4, v41, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v41, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v41, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %v40 = call i32 asm sideeffect "; def $0", "={v40}"()
+  call amdgpu_gfx void @external_void_func_void()
+  call void asm sideeffect "; use $0", "{v40}"(i32 %v40)
+  ret void
+}
+
+define hidden void @void_func_void_clobber_s33() #1 {
+; GFX9-LABEL: void_func_void_clobber_s33:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_writelane_b32 v0, s33, 0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; clobber
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_readlane_b32 s33, v0, 0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: void_func_void_clobber_s33:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_writelane_b32 v0, s33, 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; clobber
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_readlane_b32 s33, v0, 0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  call void asm sideeffect "; clobber", "~{s33}"() #0
+  ret void
+}
+
+define hidden void @void_func_void_clobber_s34() #1 {
+; GFX9-LABEL: void_func_void_clobber_s34:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_writelane_b32 v0, s34, 0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; clobber
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_readlane_b32 s34, v0, 0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: void_func_void_clobber_s34:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_writelane_b32 v0, s34, 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; clobber
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_readlane_b32 s34, v0, 0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  call void asm sideeffect "; clobber", "~{s34}"() #0
+  ret void
+}
+
+define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
+; GFX9-LABEL: test_call_void_func_void_clobber_s33:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, void_func_void_clobber_s33 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_void_clobber_s33 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_void_func_void_clobber_s33:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, void_func_void_clobber_s33 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, void_func_void_clobber_s33 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @void_func_void_clobber_s33()
+  ret void
+}
+
+define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
+; GFX9-LABEL: test_call_void_func_void_clobber_s34:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, void_func_void_clobber_s34 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_void_clobber_s34 at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: test_call_void_func_void_clobber_s34:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, void_func_void_clobber_s34 at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, void_func_void_clobber_s34 at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void @void_func_void_clobber_s34()
+  ret void
+}
+
+define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
+; GFX9-LABEL: callee_saved_sgpr_kernel:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s40, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; def s40
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 1
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s40
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s40, v40, 0
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: callee_saved_sgpr_kernel:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 3
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_writelane_b32 v40, s40, 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; def s40
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 1
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use s40
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s40, v40, 0
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
+  call amdgpu_gfx void @external_void_func_void()
+  call void asm sideeffect "; use $0", "s"(i32 %s40) #0
+  ret void
+}
+
+define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
+; GFX9-LABEL: callee_saved_sgpr_vgpr_kernel:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v41, s33, 3
+; GFX9-NEXT:    v_writelane_b32 v41, s40, 0
+; GFX9-NEXT:    v_writelane_b32 v41, s30, 1
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; def s40
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; def v32
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v41, s31, 2
+; GFX9-NEXT:    v_mov_b32_e32 v40, v32
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s40
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v40
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 1
+; GFX9-NEXT:    v_readlane_b32 s5, v41, 2
+; GFX9-NEXT:    v_readlane_b32 s40, v41, 0
+; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
+; GFX9-NEXT:    v_readlane_b32 s33, v41, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX10-LABEL: callee_saved_sgpr_vgpr_kernel:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    v_writelane_b32 v41, s33, 3
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    v_writelane_b32 v41, s40, 0
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; def s40
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; def v32
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    v_mov_b32_e32 v40, v32
+; GFX10-NEXT:    v_writelane_b32 v41, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v41, s31, 2
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use s40
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v40
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    v_readlane_b32 s4, v41, 1
+; GFX10-NEXT:    v_readlane_b32 s5, v41, 2
+; GFX10-NEXT:    v_readlane_b32 s40, v41, 0
+; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
+; GFX10-NEXT:    v_readlane_b32 s33, v41, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[4:5]
+  %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
+  %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
+  call amdgpu_gfx void @external_void_func_void()
+  call void asm sideeffect "; use $0", "s"(i32 %s40) #0
+  call void asm sideeffect "; use $0", "v"(i32 %v32) #0
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noinline }

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 3fa6be3110cc..8e77a1a84ee4 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -577,3 +577,54 @@ bb1:
 bb2:
   ret void
 }
+
+define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
+; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    v_writelane_b32 v42, s33, 6
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v42, s34, 0
+; GCN-NEXT:    v_writelane_b32 v42, s35, 1
+; GCN-NEXT:    v_writelane_b32 v42, s36, 2
+; GCN-NEXT:    v_writelane_b32 v42, s37, 3
+; GCN-NEXT:    v_writelane_b32 v42, s30, 4
+; GCN-NEXT:    v_writelane_b32 v42, s31, 5
+; GCN-NEXT:    v_mov_b32_e32 v41, v1
+; GCN-NEXT:    v_mov_b32_e32 v40, v0
+; GCN-NEXT:    s_mov_b64 s[34:35], exec
+; GCN-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s6, v40
+; GCN-NEXT:    v_readfirstlane_b32 s7, v41
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[40:41]
+; GCN-NEXT:    s_and_saveexec_b64 s[36:37], vcc
+; GCN-NEXT:    s_movk_i32 s4, 0x7b
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[36:37]
+; GCN-NEXT:    s_cbranch_execnz BB6_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[34:35]
+; GCN-NEXT:    v_readlane_b32 s4, v42, 4
+; GCN-NEXT:    v_readlane_b32 s5, v42, 5
+; GCN-NEXT:    v_readlane_b32 s37, v42, 3
+; GCN-NEXT:    v_readlane_b32 s36, v42, 2
+; GCN-NEXT:    v_readlane_b32 s35, v42, 1
+; GCN-NEXT:    v_readlane_b32 s34, v42, 0
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    v_readlane_b32 s33, v42, 6
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[4:5]
+  call amdgpu_gfx void %fptr(i32 inreg 123)
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
index 3cff62735a10..44c33347c9b5 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -1,4 +1,5 @@
-; RUN: not llc -march=amdgcn -mtriple=amdgcn-- -tailcallopt < %s 2>&1 | FileCheck -check-prefix=GCN %s
+; RUN: not llc -march=amdgcn -mtriple=amdgcn-mesa-mesa3d -tailcallopt < %s 2>&1 | FileCheck -check-prefixes=GCN,MESA %s
+; RUN: not llc -march=amdgcn -mtriple=amdgcn--amdpal -tailcallopt < %s 2>&1 | FileCheck -check-prefixes=GCN,PAL %s
 ; RUN: not llc -march=r600 -mtriple=r600-- -mcpu=cypress -tailcallopt < %s 2>&1 | FileCheck -check-prefix=R600 %s
 
 declare i32 @external_function(i32) nounwind
@@ -68,13 +69,20 @@ define void @test_indirect_call(void()* %fptr) {
   ret void
 }
 
-; GCN: :0:0: in function test_call_from_shader i32 (): unsupported call from graphics shader of function defined_function
-; R600: in function test_call{{.*}}: unsupported call to function defined_function
-define amdgpu_ps i32 @test_call_from_shader() {
+; GCN: :0:0: in function test_c_call_from_shader i32 (): unsupported calling convention for call from graphics shader of function defined_function
+; R600: in function test_c_call{{.*}}: unsupported call to function defined_function
+define amdgpu_ps i32 @test_c_call_from_shader() {
   %call = call i32 @defined_function(i32 0)
   ret i32 %call
 }
 
+; GCN-NOT: in function test_gfx_call{{.*}}unsupported
+; R600: in function test_gfx_call{{.*}}: unsupported call to function defined_function
+define amdgpu_ps i32 @test_gfx_call_from_shader() {
+  %call = call amdgpu_gfx i32 @defined_function(i32 0)
+  ret i32 %call
+}
+
 ; FIXME: Bad error message
 ; GCN: error: <unknown>:0:0: in function test_call_absolute void (): unsupported indirect call to function <unknown>
 ; R600: error: <unknown>:0:0: in function test_call_absolute void (): unsupported call to function <unknown>


        


More information about the llvm-commits mailing list