[llvm] 20d2015 - [AMDGPU] gfx11 VINTERP intrinsics and ISel support

Joe Nash via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 17 06:46:28 PDT 2022


Author: Joe Nash
Date: 2022-06-17T09:16:59-04:00
New Revision: 20d20156f4ce478407c4d5a0ef89afce17eddcf0

URL: https://github.com/llvm/llvm-project/commit/20d20156f4ce478407c4d5a0ef89afce17eddcf0
DIFF: https://github.com/llvm/llvm-project/commit/20d20156f4ce478407c4d5a0ef89afce17eddcf0.diff

LOG: [AMDGPU] gfx11 VINTERP intrinsics and ISel support

Depends on D127664

Reviewed By: rampitec, #amdgpu

Differential Revision: https://reviews.llvm.org/D127756

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll

Modified: 
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPUGISel.td
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/VINTERPInstructions.td

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index c2a6534def95..f55b83bab699 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1505,6 +1505,34 @@ def int_amdgcn_lds_param_load :
             [IntrNoMem, IntrSpeculatable, IntrWillReturn,
              ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 
+// __int_amdgcn_interp_inreg_p10 <p>, <i>, <p0>
+def int_amdgcn_interp_inreg_p10 :
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
+// __int_amdgcn_interp_inreg_p2 <p>, <j>, <tmp>
+def int_amdgcn_interp_inreg_p2 :
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
+// __int_amdgcn_interp_inreg_p10_f16 <p>, <i>, <p0>, <high>
+// high selects whether high or low 16-bits are used for p and p0 operands
+def int_amdgcn_interp_inreg_p10_f16:
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
+            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+             ImmArg<ArgIndex<3>>]>;
+
+// __int_amdgcn_interp_inreg_p2_f16 <p>, <j>, <tmp>, <high>
+// high selects whether high or low 16-bits are used for p operand
+def int_amdgcn_interp_inreg_p2_f16 :
+  Intrinsic<[llvm_half_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
+            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+             ImmArg<ArgIndex<3>>]>;
+
 // Deprecated: use llvm.amdgcn.live.mask instead.
 def int_amdgcn_ps_live : Intrinsic <
   [llvm_i1_ty],

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index eba63ccdcba1..af4b4895809f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -59,6 +59,14 @@ def gi_vop3opselmods :
     GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
     GIComplexPatternEquiv<VOP3OpSelMods>;
 
+def gi_vinterpmods :
+    GIComplexOperandMatcher<s32, "selectVINTERPMods">,
+    GIComplexPatternEquiv<VINTERPMods>;
+
+def gi_vinterpmods_hi :
+    GIComplexOperandMatcher<s32, "selectVINTERPModsHi">,
+    GIComplexPatternEquiv<VINTERPModsHi>;
+
 // FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods?
 def gi_vop3opsel :
     GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e25165eac914..19648310f607 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,7 +13,9 @@
 
 #include "AMDGPUISelDAGToDAG.h"
 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "MCTargetDesc/R600MCTargetDesc.h"
 #include "R600RegisterInfo.h"
 #include "SIMachineFunctionInfo.h"
@@ -2606,6 +2608,30 @@ bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
   return true;
 }
 
+bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
+                                               SDValue &SrcMods,
+                                               bool OpSel) const {
+  unsigned Mods;
+  if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
+    if (OpSel)
+      Mods |= SISrcMods::OP_SEL_0;
+    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
+                                           SDValue &SrcMods) const {
+  return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
+}
+
+bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
+                                             SDValue &SrcMods) const {
+  return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods, SDValue &Clamp,
                                          SDValue &Omod) const {

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 69d79fc815a3..28c79d83dbfb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -218,6 +218,11 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                          SDValue &Clamp, SDValue &Omod) const;
 
+  bool SelectVINTERPModsImpl(SDValue In, SDValue &Src, SDValue &SrcMods,
+                             bool OpSel) const;
+  bool SelectVINTERPMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVINTERPModsHi(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+
   bool SelectVOP3OMods(SDValue In, SDValue &Src, SDValue &Clamp,
                        SDValue &Omod) const;
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index f89cdd42ee45..6bf80e0dc90c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3497,9 +3497,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
 
 }
 
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
-                                              bool AllowAbs) const {
+std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
+    MachineOperand &Root, bool AllowAbs, bool OpSel, bool ForceVGPR) const {
   Register Src = Root.getReg();
   Register OrigSrc = Src;
   unsigned Mods = 0;
@@ -3516,7 +3515,10 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
     Mods |= SISrcMods::ABS;
   }
 
-  if (Mods != 0 &&
+  if (OpSel)
+    Mods |= SISrcMods::OP_SEL_0;
+
+  if ((Mods != 0 || ForceVGPR) &&
       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
     MachineInstr *UseMI = Root.getParent();
 
@@ -3708,6 +3710,36 @@ AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
   }};
 }
 
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
+  Register Src;
+  unsigned Mods;
+  std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+                                           /* AllowAbs */ false,
+                                           /* OpSel */ false,
+                                           /* ForceVGPR */ true);
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+  }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
+  Register Src;
+  unsigned Mods;
+  std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+                                           /* AllowAbs */ false,
+                                           /* OpSel */ true,
+                                           /* ForceVGPR */ true);
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+  }};
+}
+
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
   SmallVector<GEPInfo, 4> AddrInfo;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 27565f9ca687..fcf96cb85d9a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -150,8 +150,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectSMFMACIntrin(MachineInstr &I) const;
   bool selectWaveAddress(MachineInstr &I) const;
 
-  std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
-                                                   bool AllowAbs = true) const;
+  std::pair<Register, unsigned>
+  selectVOP3ModsImpl(MachineOperand &Root, bool AllowAbs = true,
+                     bool OpSel = false, bool ForceVGPR = false) const;
 
   InstructionSelector::ComplexRendererFns
   selectVCSRC(MachineOperand &Root) const;
@@ -191,6 +192,11 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   InstructionSelector::ComplexRendererFns
   selectVOP3OpSelMods(MachineOperand &Root) const;
 
+  InstructionSelector::ComplexRendererFns
+  selectVINTERPMods(MachineOperand &Root) const;
+  InstructionSelector::ComplexRendererFns
+  selectVINTERPModsHi(MachineOperand &Root) const;
+
   InstructionSelector::ComplexRendererFns
   selectSmrdImm(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8f046d07ed06..79fdae5bcee3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3017,6 +3017,12 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
       return;
     }
+    case Intrinsic::amdgcn_interp_inreg_p10:
+    case Intrinsic::amdgcn_interp_inreg_p2:
+    case Intrinsic::amdgcn_interp_inreg_p10_f16:
+    case Intrinsic::amdgcn_interp_inreg_p2_f16:
+      applyDefaultMapping(OpdMapper);
+      return;
     case Intrinsic::amdgcn_permlane16:
     case Intrinsic::amdgcn_permlanex16: {
       // Doing a waterfall loop over these wouldn't make any sense.
@@ -4469,6 +4475,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
       break;
     }
+    case Intrinsic::amdgcn_interp_inreg_p10:
+    case Intrinsic::amdgcn_interp_inreg_p2:
+    case Intrinsic::amdgcn_interp_inreg_p10_f16:
+    case Intrinsic::amdgcn_interp_inreg_p2_f16: {
+      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+      break;
+    }
     case Intrinsic::amdgcn_ballot: {
       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
index 11553c3d1c9a..c63fbbc241d9 100644
--- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -111,10 +111,55 @@ def V_INTERP_P2_RTZ_F16_F32_inreg :
 
 } // SubtargetPredicate = isGFX11Plus
 
+class VInterpF32Pat <SDPatternOperator op, Instruction inst> : GCNPat <
+   (f32 (op
+      (VINTERPMods f32:$src0, i32:$src0_modifiers),
+      (VINTERPMods f32:$src1, i32:$src1_modifiers),
+      (VINTERPMods f32:$src2, i32:$src2_modifiers))),
+    (inst $src0_modifiers, $src0,
+          $src1_modifiers, $src1,
+          $src2_modifiers, $src2,
+          0, /* clamp */
+          7) /* wait_exp */
+>;
+
 def VINTERP_OPSEL {
   int LOW = 0;
   int HIGH = 0xa;
 }
+
+class VInterpF16Pat <SDPatternOperator op, Instruction inst,
+                     ValueType dst_type, bit high,
+                     list<ComplexPattern> pat> : GCNPat <
+   (dst_type (op
+      (pat[0] f32:$src0, i32:$src0_modifiers),
+      (pat[1] f32:$src1, i32:$src1_modifiers),
+      (pat[2] f32:$src2, i32:$src2_modifiers),
+      !if(high, (i1 -1), (i1 0)))),
+    (inst $src0_modifiers, $src0,
+          $src1_modifiers, $src1,
+          $src2_modifiers, $src2,
+          0, /* clamp */
+          /* op_sel = 0 */
+          7) /* wait_exp */
+>;
+
+multiclass VInterpF16Pat <SDPatternOperator op, Instruction inst,
+                          ValueType dst_type, list<ComplexPattern> high_pat> {
+  def : VInterpF16Pat<op, inst, dst_type, 0,
+                      [VINTERPMods, VINTERPMods, VINTERPMods]>;
+  def : VInterpF16Pat<op, inst, dst_type, 1, high_pat>;
+}
+
+def : VInterpF32Pat<int_amdgcn_interp_inreg_p10, V_INTERP_P10_F32_inreg>;
+def : VInterpF32Pat<int_amdgcn_interp_inreg_p2, V_INTERP_P2_F32_inreg>;
+defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16,
+                     V_INTERP_P10_F16_F32_inreg, f32,
+                     [VINTERPModsHi, VINTERPMods, VINTERPModsHi]>;
+defm : VInterpF16Pat<int_amdgcn_interp_inreg_p2_f16,
+                     V_INTERP_P2_F16_F32_inreg, f16,
+                     [VINTERPModsHi, VINTERPMods, VINTERPMods]>;
+
 //===----------------------------------------------------------------------===//
 // VINTERP Real Instructions
 //===----------------------------------------------------------------------===//

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
new file mode 100644
index 000000000000..b2e1542c1bc0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    lds_param_load v0, attr0.y
+; GCN-NEXT:    lds_param_load v1, attr1.x
+; GCN-NEXT:    v_mov_b32_e32 v4, s1
+; GCN-NEXT:    v_interp_p10_f32 v3, v0, v2, v0 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v2, v1, v2, v1 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7
+; GCN-NEXT:    exp mrt0 v3, v2, v0, v1 done
+; GCN-NEXT:    s_endpgm
+main_body:
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0)
+  %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+  %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
+  %p1_0 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
+  %p0_1 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
+  %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_0, float %p0_1, float %p1_0, float %p1_1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32_many:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    lds_param_load v0, attr0.x
+; GCN-NEXT:    lds_param_load v1, attr1.x
+; GCN-NEXT:    lds_param_load v2, attr2.x
+; GCN-NEXT:    lds_param_load v3, attr3.x
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    v_interp_p10_f32 v6, v0, v4, v0 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v7, v1, v4, v1 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v8, v2, v4, v2 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v4, v3, v4, v3 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v3, v3, v5, v4 wait_exp:7
+; GCN-NEXT:    exp mrt0 v0, v1, v2, v3 done
+; GCN-NEXT:    s_endpgm
+main_body:
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+  %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+  %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0)
+  %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0)
+  %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
+  %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
+  %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
+  %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
+  %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2)
+  %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0)
+  %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3)
+  %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32_many_vm:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:4
+; GCN-NEXT:    s_mov_b32 m0, s0
+; GCN-NEXT:    lds_param_load v2, attr0.x
+; GCN-NEXT:    lds_param_load v3, attr1.x
+; GCN-NEXT:    lds_param_load v4, attr2.x
+; GCN-NEXT:    lds_param_load v5, attr3.x
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_interp_p10_f32 v6, v2, v0, v2 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v7, v3, v0, v3 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v8, v4, v0, v4 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v0, v5, v0, v5 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
+; GCN-NEXT:    exp mrt0 v2, v3, v4, v0 done
+; GCN-NEXT:    s_endpgm
+main_body:
+  %i.ptr = getelementptr float, float addrspace(1)* %ptr, i32 1
+  %i = load float, float addrspace(1)* %i.ptr, align 4
+  %j.ptr = getelementptr float, float addrspace(1)* %ptr, i32 2
+  %j = load float, float addrspace(1)* %j.ptr, align 4
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+  %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+  %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0)
+  %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0)
+  %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
+  %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
+  %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
+  %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
+  %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2)
+  %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0)
+  %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3)
+  %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f16:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    lds_param_load v0, attr0.x
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_interp_p10_f16_f32 v3, v0, v1, v0 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7
+; GCN-NEXT:    v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7
+; GCN-NEXT:    v_add_f16_e32 v0, v3, v0
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+  %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
+  %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %l_p0, i1 0)
+  %h_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 1)
+  %h_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %h_p0, i1 1)
+  %res = fadd half %l_p1, %h_p1
+  ret half %res
+}
+
+declare float @llvm.amdgcn.lds.param.load(i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
+declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
+declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
+declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
new file mode 100644
index 000000000000..0e00a67fcf98
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    lds_param_load v0, attr0.y
+; GCN-NEXT:    lds_param_load v1, attr1.x
+; GCN-NEXT:    v_mov_b32_e32 v4, s1
+; GCN-NEXT:    v_interp_p10_f32 v3, v0, v2, v0 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v2, v1, v2, v1 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7
+; GCN-NEXT:    exp mrt0 v3, v2, v0, v1 done
+; GCN-NEXT:    s_endpgm
+main_body:
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0)
+  %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+  %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
+  %p1_0 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
+  %p0_1 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
+  %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_0, float %p0_1, float %p1_0, float %p1_1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32_many:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    lds_param_load v0, attr0.x
+; GCN-NEXT:    lds_param_load v1, attr1.x
+; GCN-NEXT:    lds_param_load v2, attr2.x
+; GCN-NEXT:    lds_param_load v3, attr3.x
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    v_interp_p10_f32 v6, v0, v4, v0 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v7, v1, v4, v1 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v8, v2, v4, v2 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v4, v3, v4, v3 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v3, v3, v5, v4 wait_exp:7
+; GCN-NEXT:    exp mrt0 v0, v1, v2, v3 done
+; GCN-NEXT:    s_endpgm
+main_body:
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+  %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+  %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0)
+  %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0)
+  %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
+  %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
+  %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
+  %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
+  %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2)
+  %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0)
+  %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3)
+  %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32_many_vm:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:4
+; GCN-NEXT:    s_mov_b32 m0, s0
+; GCN-NEXT:    lds_param_load v2, attr0.x
+; GCN-NEXT:    lds_param_load v3, attr1.x
+; GCN-NEXT:    lds_param_load v4, attr2.x
+; GCN-NEXT:    lds_param_load v5, attr3.x
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_interp_p10_f32 v6, v2, v0, v2 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v7, v3, v0, v3 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v8, v4, v0, v4 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f32 v0, v5, v0, v5 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
+; GCN-NEXT:    exp mrt0 v2, v3, v4, v0 done
+; GCN-NEXT:    s_endpgm
+main_body:
+  %i.ptr = getelementptr float, float addrspace(1)* %ptr, i32 1
+  %i = load float, float addrspace(1)* %i.ptr, align 4
+  %j.ptr = getelementptr float, float addrspace(1)* %ptr, i32 2
+  %j = load float, float addrspace(1)* %j.ptr, align 4
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+  %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+  %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0)
+  %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0)
+  %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
+  %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
+  %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
+  %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
+  %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2)
+  %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0)
+  %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3)
+  %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f16:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    lds_param_load v0, attr0.x
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_interp_p10_f16_f32 v3, v0, v1, v0 wait_exp:7
+; GCN-NEXT:    v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7
+; GCN-NEXT:    v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7
+; GCN-NEXT:    v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7
+; GCN-NEXT:    v_add_f16_e32 v0, v3, v0
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+  %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
+  %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %l_p0, i1 0)
+  %h_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 1)
+  %h_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %h_p0, i1 1)
+  %res = fadd half %l_p1, %h_p1
+  ret half %res
+}
+
+declare float @llvm.amdgcn.lds.param.load(i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
+declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
+declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
+declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }


        


More information about the llvm-commits mailing list