[PATCH 1/2] R600/SI: add Gather4 intrinsics (v2)

Marek Olšák maraeo at gmail.com
Wed Jun 11 17:11:10 PDT 2014


From: Marek Olšák <marek.olsak at amd.com>

This adds a new type of intrinsic and SDNode: SampleRaw.
All fields of the MIMG opcodes are exposed and can be set by Mesa,
even DMASK. All GATHER4 variants are added and there are a lot of them.

v2: document DMASK behavior
---
 lib/Target/R600/AMDGPUISelLowering.cpp | 24 +++++++++
 lib/Target/R600/AMDGPUISelLowering.h   | 31 +++++++++++
 lib/Target/R600/SIISelLowering.cpp     | 72 +++++++++++++++++++++++++
 lib/Target/R600/SIISelLowering.h       |  2 +
 lib/Target/R600/SIInstrInfo.td         | 91 ++++++++++++++++++++++++++++++++
 lib/Target/R600/SIInstructions.td      | 96 +++++++++++++++++++++++++---------
 lib/Target/R600/SIIntrinsics.td        | 48 +++++++++++++++++
 7 files changed, 340 insertions(+), 24 deletions(-)

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 849f169..359161c 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -1542,6 +1542,30 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(SAMPLEB)
   NODE_NAME_CASE(SAMPLED)
   NODE_NAME_CASE(SAMPLEL)
+  NODE_NAME_CASE(GATHER4)
+  NODE_NAME_CASE(GATHER4_CL)
+  NODE_NAME_CASE(GATHER4_L)
+  NODE_NAME_CASE(GATHER4_B)
+  NODE_NAME_CASE(GATHER4_B_CL)
+  NODE_NAME_CASE(GATHER4_LZ)
+  NODE_NAME_CASE(GATHER4_C)
+  NODE_NAME_CASE(GATHER4_C_CL)
+  NODE_NAME_CASE(GATHER4_C_L)
+  NODE_NAME_CASE(GATHER4_C_B)
+  NODE_NAME_CASE(GATHER4_C_B_CL)
+  NODE_NAME_CASE(GATHER4_C_LZ)
+  NODE_NAME_CASE(GATHER4_O)
+  NODE_NAME_CASE(GATHER4_CL_O)
+  NODE_NAME_CASE(GATHER4_L_O)
+  NODE_NAME_CASE(GATHER4_B_O)
+  NODE_NAME_CASE(GATHER4_B_CL_O)
+  NODE_NAME_CASE(GATHER4_LZ_O)
+  NODE_NAME_CASE(GATHER4_C_O)
+  NODE_NAME_CASE(GATHER4_C_CL_O)
+  NODE_NAME_CASE(GATHER4_C_L_O)
+  NODE_NAME_CASE(GATHER4_C_B_O)
+  NODE_NAME_CASE(GATHER4_C_B_CL_O)
+  NODE_NAME_CASE(GATHER4_C_LZ_O)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   }
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index d5d821d..a9af195 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -203,6 +203,37 @@ enum {
   SAMPLEB,
   SAMPLED,
   SAMPLEL,
+
+  // Gather4 opcodes
+  GATHER4,
+  GATHER4_CL,
+  GATHER4_L,
+  GATHER4_B,
+  GATHER4_B_CL,
+  GATHER4_LZ,
+
+  GATHER4_C,
+  GATHER4_C_CL,
+  GATHER4_C_L,
+  GATHER4_C_B,
+  GATHER4_C_B_CL,
+  GATHER4_C_LZ,
+
+  GATHER4_O,
+  GATHER4_CL_O,
+  GATHER4_L_O,
+  GATHER4_B_O,
+  GATHER4_B_CL_O,
+  GATHER4_LZ_O,
+
+  GATHER4_C_O,
+  GATHER4_C_CL_O,
+  GATHER4_C_L_O,
+  GATHER4_C_B_O,
+  GATHER4_C_B_CL_O,
+  GATHER4_C_LZ_O,
+
+  // Memory opcodes
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
   STORE_MSKOR,
   LOAD_CONSTANT,
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 1a861d4..909255d 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -688,6 +688,59 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
                          Op.getOperand(1),
                          Op.getOperand(2),
                          Op.getOperand(3));
+
+    // Gather4 intrinsics
+    case AMDGPUIntrinsic::SI_gather4:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_cl:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_CL, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_l:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_L, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_b:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_B, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_b_cl:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_B_CL, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_lz:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_LZ, Op, DAG);
+
+    case AMDGPUIntrinsic::SI_gather4_c:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_C, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_c_cl:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_C_CL, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_c_l:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_C_L, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_c_b:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_C_B, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_c_b_cl:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_C_B_CL, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_c_lz:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_C_LZ, Op, DAG);
+
+    case AMDGPUIntrinsic::SI_gather4_o:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_O, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_cl_o:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_CL_O, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_l_o:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_L_O, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_b_o:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_B_O, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_b_cl_o:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_B_CL_O, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_lz_o:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_LZ_O, Op, DAG);
+
+    case AMDGPUIntrinsic::SI_gather4_c_o:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_C_O, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_c_cl_o:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_C_CL_O, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_c_l_o:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_C_L_O, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_c_b_o:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_C_B_O, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_c_b_cl_o:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_C_B_CL_O, Op, DAG);
+    case AMDGPUIntrinsic::SI_gather4_c_lz_o:
+      return LowerSampleRawIntrinsic(AMDGPUISD::GATHER4_C_LZ_O, Op, DAG);
     }
   }
 
@@ -876,6 +929,25 @@ SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
                      Op.getOperand(4));
 }
 
+SDValue SITargetLowering::LowerSampleRawIntrinsic(unsigned Opcode,
+                                                  const SDValue &Op,
+                                                  SelectionDAG &DAG) const {
+  SDValue Ops[] = {
+    Op.getOperand(1),
+    Op.getOperand(2),
+    Op.getOperand(3),
+    Op.getOperand(4),
+    Op.getOperand(5),
+    Op.getOperand(6),
+    Op.getOperand(7),
+    Op.getOperand(8),
+    Op.getOperand(9),
+    Op.getOperand(10),
+    Op.getOperand(11)
+  };
+  return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Ops);
+}
+
 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   if (Op.getValueType() != MVT::i64)
     return SDValue();
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index c6eaa81..b48da3b 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -25,6 +25,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
                          SDValue Chain, unsigned Offset, bool Signed) const;
   SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
                                SelectionDAG &DAG) const;
+  SDValue LowerSampleRawIntrinsic(unsigned Opcode, const SDValue &Op,
+                                  SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 3368d49..23a7ca3 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -57,6 +57,50 @@ def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
 def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
 def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
 
+class SDSampleRaw<string opcode> : SDNode <opcode,
+  SDTypeProfile<1, 11,
+    [SDTCisVT<0, v4f32>, // vdata(VGPR)
+     SDTCisVT<2, v32i8>, // rsrc(SGPR)
+     SDTCisVT<3, v4i32>, // sampler(SGPR)
+     SDTCisVT<4, i32>,   // dmask(imm)
+     SDTCisVT<5, i32>,   // unorm(imm)
+     SDTCisVT<6, i32>,   // r128(imm)
+     SDTCisVT<7, i32>,   // da(imm)
+     SDTCisVT<8, i32>,   // glc(imm)
+     SDTCisVT<9, i32>,   // slc(imm)
+     SDTCisVT<10, i32>,   // tfe(imm)
+     SDTCisVT<11, i32>   // lwe(imm)
+    ]>
+>;
+
+def SIgather4 : SDSampleRaw<"AMDGPUISD::GATHER4">;
+def SIgather4_cl : SDSampleRaw<"AMDGPUISD::GATHER4_CL">;
+def SIgather4_l : SDSampleRaw<"AMDGPUISD::GATHER4_L">;
+def SIgather4_b : SDSampleRaw<"AMDGPUISD::GATHER4_B">;
+def SIgather4_b_cl : SDSampleRaw<"AMDGPUISD::GATHER4_B_CL">;
+def SIgather4_lz : SDSampleRaw<"AMDGPUISD::GATHER4_LZ">;
+
+def SIgather4_c : SDSampleRaw<"AMDGPUISD::GATHER4_C">;
+def SIgather4_c_cl : SDSampleRaw<"AMDGPUISD::GATHER4_C_CL">;
+def SIgather4_c_l : SDSampleRaw<"AMDGPUISD::GATHER4_C_L">;
+def SIgather4_c_b : SDSampleRaw<"AMDGPUISD::GATHER4_C_B">;
+def SIgather4_c_b_cl : SDSampleRaw<"AMDGPUISD::GATHER4_C_B_CL">;
+def SIgather4_c_lz : SDSampleRaw<"AMDGPUISD::GATHER4_C_LZ">;
+
+def SIgather4_o : SDSampleRaw<"AMDGPUISD::GATHER4_O">;
+def SIgather4_cl_o : SDSampleRaw<"AMDGPUISD::GATHER4_CL_O">;
+def SIgather4_l_o : SDSampleRaw<"AMDGPUISD::GATHER4_L_O">;
+def SIgather4_b_o : SDSampleRaw<"AMDGPUISD::GATHER4_B_O">;
+def SIgather4_b_cl_o : SDSampleRaw<"AMDGPUISD::GATHER4_B_CL_O">;
+def SIgather4_lz_o : SDSampleRaw<"AMDGPUISD::GATHER4_LZ_O">;
+
+def SIgather4_c_o : SDSampleRaw<"AMDGPUISD::GATHER4_C_O">;
+def SIgather4_c_cl_o : SDSampleRaw<"AMDGPUISD::GATHER4_C_CL_O">;
+def SIgather4_c_l_o : SDSampleRaw<"AMDGPUISD::GATHER4_C_L_O">;
+def SIgather4_c_b_o : SDSampleRaw<"AMDGPUISD::GATHER4_C_B_O">;
+def SIgather4_c_b_cl_o : SDSampleRaw<"AMDGPUISD::GATHER4_C_B_CL_O">;
+def SIgather4_c_lz_o : SDSampleRaw<"AMDGPUISD::GATHER4_C_LZ_O">;
+
 // Transformation function, extract the lower 32bit of a 64bit immediate
 def LO32 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
@@ -658,6 +702,53 @@ multiclass MIMG_Sampler <bits<7> op, string asm> {
   defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>;
 }
 
+class MIMG_Gather_Helper <bits<7> op, string asm,
+                          RegisterClass dst_rc,
+                          RegisterClass src_rc> : MIMG <
+  op,
+  (outs dst_rc:$vdata),
+  (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
+       SReg_256:$srsrc, SReg_128:$ssamp),
+  asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
+     #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
+  []> {
+  let mayLoad = 1;
+  let mayStore = 0;
+
+  // DMASK was repurposed for GATHER4. 4 components are always
+  // returned and DMASK works like a swizzle - it selects
+  // the component to fetch. The only useful DMASK values are
+  // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+  // (red,red,red,red) etc.) The ISA document doesn't mention
+  // this.
+  // Therefore, disable all code which updates DMASK by setting these two:
+  let MIMG = 0;
+  let hasPostISelHook = 0;
+}
+
+multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
+                                    RegisterClass dst_rc,
+                                    int channels> {
+  def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_32>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128>,
+            MIMG_Mask<asm#"_V4", channels>;
+  def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256>,
+            MIMG_Mask<asm#"_V8", channels>;
+  def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512>,
+            MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Gather <bits<7> op, string asm> {
+  defm _V1 : MIMG_Gather_Src_Helper<op, asm, VReg_32, 1>;
+  defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2>;
+  defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3>;
+  defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4>;
+}
+
 //===----------------------------------------------------------------------===//
 // Vector instruction mappings
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index d4a7c5c..d65d88b 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -887,30 +887,30 @@ defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
 //def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>;
 //def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>;
 //def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>;
-//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>;
-//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>;
-//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>;
-//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>;
-//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>;
-//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>;
-//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>;
-//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>;
-//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>;
-//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>;
-//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>;
-//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>;
-//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>;
-//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>;
-//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>;
-//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>;
-//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>;
-//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>;
-//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>;
-//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>;
-//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>;
-//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>;
-//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>;
-//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>;
+defm IMAGE_GATHER4          : MIMG_Gather <0x00000040, "IMAGE_GATHER4">;
+defm IMAGE_GATHER4_CL       : MIMG_Gather <0x00000041, "IMAGE_GATHER4_CL">;
+defm IMAGE_GATHER4_L        : MIMG_Gather <0x00000044, "IMAGE_GATHER4_L">;
+defm IMAGE_GATHER4_B        : MIMG_Gather <0x00000045, "IMAGE_GATHER4_B">;
+defm IMAGE_GATHER4_B_CL     : MIMG_Gather <0x00000046, "IMAGE_GATHER4_B_CL">;
+defm IMAGE_GATHER4_LZ       : MIMG_Gather <0x00000047, "IMAGE_GATHER4_LZ">;
+defm IMAGE_GATHER4_C        : MIMG_Gather <0x00000048, "IMAGE_GATHER4_C">;
+defm IMAGE_GATHER4_C_CL     : MIMG_Gather <0x00000049, "IMAGE_GATHER4_C_CL">;
+defm IMAGE_GATHER4_C_L      : MIMG_Gather <0x0000004c, "IMAGE_GATHER4_C_L">;
+defm IMAGE_GATHER4_C_B      : MIMG_Gather <0x0000004d, "IMAGE_GATHER4_C_B">;
+defm IMAGE_GATHER4_C_B_CL   : MIMG_Gather <0x0000004e, "IMAGE_GATHER4_C_B_CL">;
+defm IMAGE_GATHER4_C_LZ     : MIMG_Gather <0x0000004f, "IMAGE_GATHER4_C_LZ">;
+defm IMAGE_GATHER4_O        : MIMG_Gather <0x00000050, "IMAGE_GATHER4_O">;
+defm IMAGE_GATHER4_CL_O     : MIMG_Gather <0x00000051, "IMAGE_GATHER4_CL_O">;
+defm IMAGE_GATHER4_L_O      : MIMG_Gather <0x00000054, "IMAGE_GATHER4_L_O">;
+defm IMAGE_GATHER4_B_O      : MIMG_Gather <0x00000055, "IMAGE_GATHER4_B_O">;
+defm IMAGE_GATHER4_B_CL_O   : MIMG_Gather <0x00000056, "IMAGE_GATHER4_B_CL_O">;
+defm IMAGE_GATHER4_LZ_O     : MIMG_Gather <0x00000057, "IMAGE_GATHER4_LZ_O">;
+defm IMAGE_GATHER4_C_O      : MIMG_Gather <0x00000058, "IMAGE_GATHER4_C_O">;
+defm IMAGE_GATHER4_C_CL_O   : MIMG_Gather <0x00000059, "IMAGE_GATHER4_C_CL_O">;
+defm IMAGE_GATHER4_C_L_O    : MIMG_Gather <0x0000005c, "IMAGE_GATHER4_C_L_O">;
+defm IMAGE_GATHER4_C_B_O    : MIMG_Gather <0x0000005d, "IMAGE_GATHER4_C_B_O">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "IMAGE_GATHER4_C_B_CL_O">;
+defm IMAGE_GATHER4_C_LZ_O   : MIMG_Gather <0x0000005f, "IMAGE_GATHER4_C_LZ_O">;
 //def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>;
 //def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>;
 //def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>;
@@ -1655,6 +1655,54 @@ def : SextInReg <i16, 16>;
 /********** Image sampling patterns **********/
 /********** ======================= **********/
 
+class SampleRawPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+  (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
+        i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
+  (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da),
+          (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
+          $addr, $rsrc, $sampler)
+>;
+
+// Gather4 patterns. Only the variants which make sense are defined.
+def : SampleRawPattern<SIgather4,           IMAGE_GATHER4_V4_V2,        v2i32>;
+def : SampleRawPattern<SIgather4,           IMAGE_GATHER4_V4_V4,        v4i32>;
+def : SampleRawPattern<SIgather4_cl,        IMAGE_GATHER4_CL_V4_V4,     v4i32>;
+def : SampleRawPattern<SIgather4_l,         IMAGE_GATHER4_L_V4_V4,      v4i32>;
+def : SampleRawPattern<SIgather4_b,         IMAGE_GATHER4_B_V4_V4,      v4i32>;
+def : SampleRawPattern<SIgather4_b_cl,      IMAGE_GATHER4_B_CL_V4_V4,   v4i32>;
+def : SampleRawPattern<SIgather4_b_cl,      IMAGE_GATHER4_B_CL_V4_V8,   v8i32>;
+def : SampleRawPattern<SIgather4_lz,        IMAGE_GATHER4_LZ_V4_V2,     v2i32>;
+def : SampleRawPattern<SIgather4_lz,        IMAGE_GATHER4_LZ_V4_V4,     v4i32>;
+
+def : SampleRawPattern<SIgather4_c,         IMAGE_GATHER4_C_V4_V4,      v4i32>;
+def : SampleRawPattern<SIgather4_c_cl,      IMAGE_GATHER4_C_CL_V4_V4,   v4i32>;
+def : SampleRawPattern<SIgather4_c_cl,      IMAGE_GATHER4_C_CL_V4_V8,   v8i32>;
+def : SampleRawPattern<SIgather4_c_l,       IMAGE_GATHER4_C_L_V4_V4,    v4i32>;
+def : SampleRawPattern<SIgather4_c_l,       IMAGE_GATHER4_C_L_V4_V8,    v8i32>;
+def : SampleRawPattern<SIgather4_c_b,       IMAGE_GATHER4_C_B_V4_V4,    v4i32>;
+def : SampleRawPattern<SIgather4_c_b,       IMAGE_GATHER4_C_B_V4_V8,    v8i32>;
+def : SampleRawPattern<SIgather4_c_b_cl,    IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<SIgather4_c_lz,      IMAGE_GATHER4_C_LZ_V4_V4,   v4i32>;
+
+def : SampleRawPattern<SIgather4_o,         IMAGE_GATHER4_O_V4_V4,      v4i32>;
+def : SampleRawPattern<SIgather4_cl_o,      IMAGE_GATHER4_CL_O_V4_V4,   v4i32>;
+def : SampleRawPattern<SIgather4_cl_o,      IMAGE_GATHER4_CL_O_V4_V8,   v8i32>;
+def : SampleRawPattern<SIgather4_l_o,       IMAGE_GATHER4_L_O_V4_V4,    v4i32>;
+def : SampleRawPattern<SIgather4_l_o,       IMAGE_GATHER4_L_O_V4_V8,    v8i32>;
+def : SampleRawPattern<SIgather4_b_o,       IMAGE_GATHER4_B_O_V4_V4,    v4i32>;
+def : SampleRawPattern<SIgather4_b_o,       IMAGE_GATHER4_B_O_V4_V8,    v8i32>;
+def : SampleRawPattern<SIgather4_b_cl_o,    IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<SIgather4_lz_o,      IMAGE_GATHER4_LZ_O_V4_V4,   v4i32>;
+
+def : SampleRawPattern<SIgather4_c_o,       IMAGE_GATHER4_C_O_V4_V4,    v4i32>;
+def : SampleRawPattern<SIgather4_c_o,       IMAGE_GATHER4_C_O_V4_V8,    v8i32>;
+def : SampleRawPattern<SIgather4_c_cl_o,    IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<SIgather4_c_l_o,     IMAGE_GATHER4_C_L_O_V4_V8,  v8i32>;
+def : SampleRawPattern<SIgather4_c_b_o,     IMAGE_GATHER4_C_B_O_V4_V8,  v8i32>;
+def : SampleRawPattern<SIgather4_c_b_cl_o,  IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<SIgather4_c_lz_o,    IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>;
+def : SampleRawPattern<SIgather4_c_lz_o,    IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>;
+
 /* SIsample for simple 1D texture lookup */
 def : Pat <
   (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 00e32c0..9d85f17 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -56,11 +56,59 @@ let TargetPrefix = "SI", isTarget = 1 in {
 
   class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
 
+  // Fully-flexible SAMPLE instruction.
+  class SampleRaw : Intrinsic <
+    [llvm_v4f32_ty],    // vdata(VGPR)
+    [llvm_anyvector_ty, // vaddr(VGPR)
+     llvm_v32i8_ty,     // rsrc(SGPR)
+     llvm_v16i8_ty,     // sampler(SGPR)
+     llvm_i32_ty,       // dmask(imm)
+     llvm_i32_ty,       // unorm(imm)
+     llvm_i32_ty,       // r128(imm)
+     llvm_i32_ty,       // da(imm)
+     llvm_i32_ty,       // glc(imm)
+     llvm_i32_ty,       // slc(imm)
+     llvm_i32_ty,       // tfe(imm)
+     llvm_i32_ty],      // lwe(imm)
+    [IntrNoMem]>;
+
   def int_SI_sample : Sample;
   def int_SI_sampleb : Sample;
   def int_SI_sampled : Sample;
   def int_SI_samplel : Sample;
 
+  // Basic gather4
+  def int_SI_gather4 : SampleRaw;
+  def int_SI_gather4_cl : SampleRaw;
+  def int_SI_gather4_l : SampleRaw;
+  def int_SI_gather4_b : SampleRaw;
+  def int_SI_gather4_b_cl : SampleRaw;
+  def int_SI_gather4_lz : SampleRaw;
+
+  // Gather4 with comparison
+  def int_SI_gather4_c : SampleRaw;
+  def int_SI_gather4_c_cl : SampleRaw;
+  def int_SI_gather4_c_l : SampleRaw;
+  def int_SI_gather4_c_b : SampleRaw;
+  def int_SI_gather4_c_b_cl : SampleRaw;
+  def int_SI_gather4_c_lz : SampleRaw;
+
+  // Gather4 with offsets
+  def int_SI_gather4_o : SampleRaw;
+  def int_SI_gather4_cl_o : SampleRaw;
+  def int_SI_gather4_l_o : SampleRaw;
+  def int_SI_gather4_b_o : SampleRaw;
+  def int_SI_gather4_b_cl_o : SampleRaw;
+  def int_SI_gather4_lz_o : SampleRaw;
+
+  // Gather4 with comparison and offsets
+  def int_SI_gather4_c_o : SampleRaw;
+  def int_SI_gather4_c_cl_o : SampleRaw;
+  def int_SI_gather4_c_l_o : SampleRaw;
+  def int_SI_gather4_c_b_o : SampleRaw;
+  def int_SI_gather4_c_b_cl_o : SampleRaw;
+  def int_SI_gather4_c_lz_o : SampleRaw;
+
   def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
 
   def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-- 
1.9.1




More information about the llvm-commits mailing list