[PATCH] R600/SI: Adjust buffer_load opcodes
Christian König
deathsimple at vodafone.de
Tue Apr 2 06:11:26 PDT 2013
From: Christian König <christian.koenig at amd.com>
Only load the vector components that are actually used. This is
practically the same approach as what I've done before for textures.
Signed-off-by: Christian König <christian.koenig at amd.com>
---
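Note for reviewers (not part of the commit): a minimal IR sketch of the case
this change targets, analogous to the new test below; the function name and
argument names are illustrative. With only the X component of the loaded
vector used, the backend should now select BUFFER_LOAD_FORMAT_X instead of
the full BUFFER_LOAD_FORMAT_XYZW:

  define void @only_x(<16 x i8> addrspace(2)* inreg %rp, i32 %idx) {
  main_body:
    ; load the vertex buffer resource descriptor
    %r = load <16 x i8> addrspace(2)* %rp
    ; fetch a vec4 input, but read only component 0 (X)
    %v = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %r, i32 0, i32 %idx)
    %x = extractelement <4 x float> %v, i32 0
    call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %x, float %x, float %x, float %x)
    ret void
  }

  declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) nounwind readnone
  declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)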
lib/Target/R600/SIISelLowering.cpp | 72 ++++++++++++++++++++++++----
lib/Target/R600/SIISelLowering.h | 1 +
lib/Target/R600/SIInstructions.td | 6 +--
test/CodeGen/R600/llvm.SI.vs.load.input.ll | 25 ++++++++++
4 files changed, 91 insertions(+), 13 deletions(-)
create mode 100644 test/CodeGen/R600/llvm.SI.vs.load.input.ll
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index e4b3111..9c23f3a 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -671,7 +671,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
}
/// \brief Helper function for adjustWritemask
-unsigned SubIdx2Lane(unsigned Idx) {
+static unsigned subIdx2Lane(unsigned Idx) {
switch (Idx) {
default: return 0;
case AMDGPU::sub0: return 0;
@@ -681,11 +681,11 @@ unsigned SubIdx2Lane(unsigned Idx) {
}
}
-/// \brief Adjust the writemask of MIMG instructions
-void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
- SelectionDAG &DAG) const {
- SDNode *Users[4] = { };
- unsigned Writemask = 0, Lane = 0;
+/// \brief Get the sub components used in a node's result
+static unsigned getUsedComponents(SDNode *Node, SDNode **Users,
+ unsigned &Lane) {
+
+ unsigned Writemask = 0;
// Try to figure out the used register components
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
@@ -694,18 +694,29 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Abort if we can't understand the usage
if (!I->isMachineOpcode() ||
I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
- return;
+ return 0xf;
- Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+ Lane = subIdx2Lane(I->getConstantOperandVal(1));
// Abort if we have more than one user per component
if (Users[Lane])
- return;
+ return 0xf;
Users[Lane] = *I;
Writemask |= 1 << Lane;
}
+ return Writemask;
+}
+
+/// \brief Adjust the writemask of MIMG instructions
+void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+ SelectionDAG &DAG) const {
+ unsigned Writemask = 0, Lane = 0;
+ SDNode *Users[4] = { };
+
+ Writemask = getUsedComponents(Node, Users, Lane);
+
// Abort if all components are used
if (Writemask == 0xf)
return;
@@ -719,6 +730,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// If we only got one lane, replace it with a copy
if (Writemask == (1U << Lane)) {
+
SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
DebugLoc(), MVT::f32,
@@ -746,13 +758,53 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
}
}
+void SITargetLowering::adjustBufferLoad(MachineSDNode *&Node,
+ SelectionDAG &DAG) const {
+ unsigned Writemask = 0, Lane = 0;
+ SDNode *Users[4] = { };
+
+ Writemask = getUsedComponents(Node, Users, Lane);
+
+ unsigned Opcode;
+ if (Writemask >= 0x8)
+ return;
+ else if (Writemask >= 0x4)
+ Opcode = AMDGPU::BUFFER_LOAD_FORMAT_XYZ;
+ else if (Writemask >= 0x2)
+ Opcode = AMDGPU::BUFFER_LOAD_FORMAT_XY;
+ else if (Writemask == 0x1) {
+ Opcode = AMDGPU::BUFFER_LOAD_FORMAT_X;
+
+ // If only X is used replace it with a copy
+ SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
+ SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ DebugLoc(), MVT::f32,
+ SDValue(Node, 0), RC);
+ DAG.ReplaceAllUsesWith(Users[0], Copy);
+ } else
+ llvm_unreachable("Unused buffer load not optimized away!");
+
+ // Create a node with the new opcode
+ std::vector<SDValue> Ops;
+ for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
+ Ops.push_back(Node->getOperand(i));
+
+ Node = DAG.getMachineNode(Opcode, Node->getDebugLoc(),
+ Node->getValueType(0), Ops.data(),
+ Ops.size());
+}
+
/// \brief Fold the instructions after selecting them
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
- if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
+ unsigned Opcode = Node->getMachineOpcode();
+ if (AMDGPU::isMIMG(Opcode) != -1)
adjustWritemask(Node, DAG);
+ if (AMDGPU::BUFFER_LOAD_FORMAT_XYZW == Opcode)
+ adjustBufferLoad(Node, DAG);
+
return foldOperands(Node, DAG);
}
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index ef548ca..0c0b639 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -35,6 +35,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const;
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+ void adjustBufferLoad(MachineSDNode *&Node, SelectionDAG &DAG) const;
public:
SITargetLowering(TargetMachine &tm);
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index eb410d7..3c847bf 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -391,9 +391,9 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
} // End isCompare = 1
-//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
-//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
-//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
+def BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper <0x00000000, "BUFFER_LOAD_FORMAT_X", VReg_32>;
+def BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper <0x00000001, "BUFFER_LOAD_FORMAT_XY", VReg_64>;
+def BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", VReg_96>;
def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
diff --git a/test/CodeGen/R600/llvm.SI.vs.load.input.ll b/test/CodeGen/R600/llvm.SI.vs.load.input.ll
new file mode 100644
index 0000000..905794d
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.vs.load.input.ll
@@ -0,0 +1,25 @@
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+
+;CHECK: BUFFER_LOAD_FORMAT_XYZW {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}
+;CHECK: BUFFER_LOAD_FORMAT_XYZ {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}
+;CHECK: BUFFER_LOAD_FORMAT_XY {{VGPR[0-9]+_VGPR[0-9]+}}
+;CHECK: BUFFER_LOAD_FORMAT_X {{VGPR[0-9]+}}
+
+define void @main(<16 x i8> addrspace(2)* inreg %rp, i32 %i0, i32 %i1, i32 %i2, i32 %i3) {
+main_body:
+ %r = load <16 x i8> addrspace(2)* %rp
+ %v0 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %r, i32 0, i32 %i0)
+ %e0 = extractelement <4 x float> %v0, i32 0
+ %v1 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %r, i32 0, i32 %i1)
+ %e1 = extractelement <4 x float> %v1, i32 1
+ %v2 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %r, i32 0, i32 %i2)
+ %e2 = extractelement <4 x float> %v2, i32 2
+ %v3 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %r, i32 0, i32 %i3)
+ %e3 = extractelement <4 x float> %v3, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %e0, float %e1, float %e2, float %e3)
+ ret void
+}
+
+declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) nounwind readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
--
1.7.9.5