[PATCH] R600/SI: Adjust buffer_load opcodes
Christian König
deathsimple at vodafone.de
Tue Apr 2 06:11:26 PDT 2013
From: Christian König <christian.koenig at amd.com>
Only load the vector components that are actually used. This is
practically the same approach as what I've done before for textures.
Signed-off-by: Christian König <christian.koenig at amd.com>
---
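Note for reviewers (not part of the commit): a minimal IR sketch of the case
this change targets, analogous to the new test below; the function name and
argument names are illustrative. With only the X component of the loaded
vector used, the backend should now select BUFFER_LOAD_FORMAT_X instead of
the full BUFFER_LOAD_FORMAT_XYZW:

  define void @only_x(<16 x i8> addrspace(2)* inreg %rp, i32 %idx) {
  main_body:
    ; load the vertex buffer resource descriptor
    %r = load <16 x i8> addrspace(2)* %rp
    ; fetch a vec4 input, but read only component 0 (X)
    %v = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %r, i32 0, i32 %idx)
    %x = extractelement <4 x float> %v, i32 0
    call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %x, float %x, float %x, float %x)
    ret void
  }

  declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) nounwind readnone
  declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)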
lib/Target/R600/SIISelLowering.cpp | 72 ++++++++++++++++++++++++----
lib/Target/R600/SIISelLowering.h | 1 +
lib/Target/R600/SIInstructions.td | 6 +--
test/CodeGen/R600/llvm.SI.vs.load.input.ll | 25 ++++++++++
4 files changed, 91 insertions(+), 13 deletions(-)
create mode 100644 test/CodeGen/R600/llvm.SI.vs.load.input.ll
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index e4b3111..9c23f3a 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -671,7 +671,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
}
/// \brief Helper function for adjustWritemask
-unsigned SubIdx2Lane(unsigned Idx) {
+static unsigned subIdx2Lane(unsigned Idx) {
switch (Idx) {
default: return 0;
case AMDGPU::sub0: return 0;
@@ -681,11 +681,11 @@ unsigned SubIdx2Lane(unsigned Idx) {
}
}
-/// \brief Adjust the writemask of MIMG instructions
-void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
- SelectionDAG &DAG) const {
- SDNode *Users[4] = { };
- unsigned Writemask = 0, Lane = 0;
+/// \brief Get the sub components used in a node's result
+static unsigned getUsedComponents(SDNode *Node, SDNode **Users,
+ unsigned &Lane) {
+
+ unsigned Writemask = 0;
// Try to figure out the used register components
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
@@ -694,18 +694,29 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Abort if we can't understand the usage
if (!I->isMachineOpcode() ||
I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
- return;
+ return 0xf;
- Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+ Lane = subIdx2Lane(I->getConstantOperandVal(1));
// Abort if we have more than one user per component
if (Users[Lane])
- return;
+ return 0xf;
Users[Lane] = *I;
Writemask |= 1 << Lane;
}
+ return Writemask;
+}
+
+/// \brief Adjust the writemask of MIMG instructions
+void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+ SelectionDAG &DAG) const {
+ unsigned Writemask = 0, Lane = 0;
+ SDNode *Users[4] = { };
+
+ Writemask = getUsedComponents(Node, Users, Lane);
+
// Abort if all components are used
if (Writemask == 0xf)
return;
@@ -719,6 +730,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// If we only got one lane, replace it with a copy
if (Writemask == (1U << Lane)) {
+
SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
DebugLoc(), MVT::f32,
@@ -746,13 +758,53 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
}
}
+void SITargetLowering::adjustBufferLoad(MachineSDNode *&Node,
+ SelectionDAG &DAG) const {
+ unsigned Writemask = 0, Lane = 0;
+ SDNode *Users[4] = { };
+
+ Writemask = getUsedComponents(Node, Users, Lane);
+
+ unsigned Opcode;
+ if (Writemask >= 0x8)
+ return;
+ else if (Writemask >= 0x4)
+ Opcode = AMDGPU::BUFFER_LOAD_FORMAT_XYZ;
+ else if (Writemask >= 0x2)
+ Opcode = AMDGPU::BUFFER_LOAD_FORMAT_XY;
+ else if (Writemask == 0x1) {
+ Opcode = AMDGPU::BUFFER_LOAD_FORMAT_X;
+
+ // If only X is used replace it with a copy
+ SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
+ SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ DebugLoc(), MVT::f32,
+ SDValue(Node, 0), RC);
+ DAG.ReplaceAllUsesWith(Users[0], Copy);
+ } else
+ llvm_unreachable("Unused buffer load not optimized away!");
+
+ // Create a node with the new opcode
+ std::vector<SDValue> Ops;
+ for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
+ Ops.push_back(Node->getOperand(i));
+
+ Node = DAG.getMachineNode(Opcode, Node->getDebugLoc(),
+ Node->getValueType(0), Ops.data(),
+ Ops.size());
+}
+
/// \brief Fold the instructions after selecting them
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
- if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
+ unsigned Opcode = Node->getMachineOpcode();
+ if (AMDGPU::isMIMG(Opcode) != -1)
adjustWritemask(Node, DAG);
+ if (AMDGPU::BUFFER_LOAD_FORMAT_XYZW == Opcode)
+ adjustBufferLoad(Node, DAG);
+
return foldOperands(Node, DAG);
}
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index ef548ca..0c0b639 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -35,6 +35,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const;
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+ void adjustBufferLoad(MachineSDNode *&Node, SelectionDAG &DAG) const;
public:
SITargetLowering(TargetMachine &tm);
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index eb410d7..3c847bf 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -391,9 +391,9 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
} // End isCompare = 1
-//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
-//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
-//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
+def BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper <0x00000000, "BUFFER_LOAD_FORMAT_X", VReg_32>;
+def BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper <0x00000001, "BUFFER_LOAD_FORMAT_XY", VReg_64>;
+def BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", VReg_96>;
def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
diff --git a/test/CodeGen/R600/llvm.SI.vs.load.input.ll b/test/CodeGen/R600/llvm.SI.vs.load.input.ll
new file mode 100644
index 0000000..905794d
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.vs.load.input.ll
@@ -0,0 +1,25 @@
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+
+;CHECK: BUFFER_LOAD_FORMAT_XYZW {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}
+;CHECK: BUFFER_LOAD_FORMAT_XYZ {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}
+;CHECK: BUFFER_LOAD_FORMAT_XY {{VGPR[0-9]+_VGPR[0-9]+}}
+;CHECK: BUFFER_LOAD_FORMAT_X {{VGPR[0-9]+}}
+
+define void @main(<16 x i8> addrspace(2)* inreg %rp, i32 %i0, i32 %i1, i32 %i2, i32 %i3) {
+main_body:
+ %r = load <16 x i8> addrspace(2)* %rp
+ %v0 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %r, i32 0, i32 %i0)
+ %e0 = extractelement <4 x float> %v0, i32 0
+ %v1 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %r, i32 0, i32 %i1)
+ %e1 = extractelement <4 x float> %v1, i32 1
+ %v2 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %r, i32 0, i32 %i2)
+ %e2 = extractelement <4 x float> %v2, i32 2
+ %v3 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %r, i32 0, i32 %i3)
+ %e3 = extractelement <4 x float> %v3, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %e0, float %e1, float %e2, float %e3)
+ ret void
+}
+
+declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) nounwind readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
--
1.7.9.5