[PATCH] R600/SI: Use VALU instructions for i1 ops

Thu May 8 21:24:26 PDT 2014

---
 lib/Target/R600/SIInstructions.td   | 20 ++++++++++----------
 lib/Target/R600/SILowerI1Copies.cpp | 20 +++++++++++++++++++-
 test/CodeGen/R600/setcc.ll          | 24 ++++++++++++++++++------
 test/CodeGen/R600/setcc64.ll        | 24 ++++++++++++++++++------
 4 files changed, 65 insertions(+), 23 deletions(-)

diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 57b3724..0a7b218 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1408,6 +1408,16 @@ def V_MOV_I1 : InstSI <
   "", [(set i1:$dst, (imm:$src))]
 >;
 
+def V_AND_I1 : InstSI <
+   (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
+   [(set i1:$dst, (and i1:$src0, i1:$src1))]
+>;
+
+def V_OR_I1 : InstSI <
+   (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
+   [(set i1:$dst, (or i1:$src0, i1:$src1))]
+>;
+
 def LOAD_CONST : AMDGPUShaderInst <
   (outs GPRF32:$dst),
   (ins i32imm:$src),
@@ -1664,16 +1674,6 @@ def : Pat <
 //===----------------------------------------------------------------------===//
 
 def : Pat <
-  (i1 (and i1:$src0, i1:$src1)),
-  (S_AND_B64 $src0, $src1)
->;
-
-def : Pat <
-  (i1 (or i1:$src0, i1:$src1)),
-  (S_OR_B64 $src0, $src1)
->;
-
-def : Pat <
   (i1 (xor i1:$src0, i1:$src1)),
   (S_XOR_B64 $src0, $src1)
 >;
diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp
index 766380e..738c90b 100644
--- a/lib/Target/R600/SILowerI1Copies.cpp
+++ b/lib/Target/R600/SILowerI1Copies.cpp
@@ -73,6 +73,7 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
       MF.getTarget().getInstrInfo());
   const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  std::vector<unsigned> I1Defs;
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                   BI != BE; ++BI) {
@@ -84,10 +85,23 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
       MachineInstr &MI = *I;
 
       if (MI.getOpcode() == AMDGPU::V_MOV_I1) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
         MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
         continue;
       }
 
+      if (MI.getOpcode() == AMDGPU::V_AND_I1) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
+        MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32));
+        continue;
+      }
+
+      if (MI.getOpcode() == AMDGPU::V_OR_I1) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
+        MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32));
+        continue;
+      }
+
       if (MI.getOpcode() != AMDGPU::COPY ||
           !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
           !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
@@ -101,6 +115,7 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
 
       if (DstRC == &AMDGPU::VReg_1RegClass &&
           TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
         BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64))
                 .addOperand(MI.getOperand(0))
                 .addImm(0)
@@ -123,8 +138,11 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
                 .addImm(0);
         MI.eraseFromParent();
       }
-
     }
   }
+
+  for (unsigned Reg : I1Defs)
+    MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass);
+
   return false;
 }
diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index ad72732..5bd95b7 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll
@@ -96,7 +96,9 @@ entry:
 ; R600-DAG: SETNE_INT
 ; SI: V_CMP_O_F32
 ; SI: V_CMP_NEQ_F32
-; SI: S_AND_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_AND_B32_e32
 define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp one float %a, %b
@@ -128,7 +130,9 @@ entry:
 ; R600-DAG: SETNE_INT
 ; SI: V_CMP_U_F32
 ; SI: V_CMP_EQ_F32
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
 define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ueq float %a, %b
@@ -142,7 +146,9 @@ entry:
 ; R600: SETE_DX10
 ; SI: V_CMP_U_F32
 ; SI: V_CMP_GT_F32
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
 define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ugt float %a, %b
@@ -156,7 +162,9 @@ entry:
 ; R600: SETE_DX10
 ; SI: V_CMP_U_F32
 ; SI: V_CMP_GE_F32
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
 define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp uge float %a, %b
@@ -170,7 +178,9 @@ entry:
 ; R600: SETE_DX10
 ; SI: V_CMP_U_F32
 ; SI: V_CMP_LT_F32
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
 define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ult float %a, %b
@@ -184,7 +194,9 @@ entry:
 ; R600: SETE_DX10
 ; SI: V_CMP_U_F32
 ; SI: V_CMP_LE_F32
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
 define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ule float %a, %b
diff --git a/test/CodeGen/R600/setcc64.ll b/test/CodeGen/R600/setcc64.ll
index c137125..54a33b3 100644
--- a/test/CodeGen/R600/setcc64.ll
+++ b/test/CodeGen/R600/setcc64.ll
@@ -59,7 +59,9 @@ entry:
 ; FUNC-LABEL: @f64_one
 ; SI: V_CMP_O_F64
 ; SI: V_CMP_NEQ_F64
-; SI: S_AND_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_AND_B32_e32
 define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp one double %a, %b
@@ -81,7 +83,9 @@ entry:
 ; FUNC-LABEL: @f64_ueq
 ; SI: V_CMP_U_F64
 ; SI: V_CMP_EQ_F64
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
 define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ueq double %a, %b
@@ -93,7 +97,9 @@ entry:
 ; FUNC-LABEL: @f64_ugt
 ; SI: V_CMP_U_F64
 ; SI: V_CMP_GT_F64
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
 define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ugt double %a, %b
@@ -105,7 +111,9 @@ entry:
 ; FUNC-LABEL: @f64_uge
 ; SI: V_CMP_U_F64
 ; SI: V_CMP_GE_F64
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
 define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp uge double %a, %b
@@ -117,7 +125,9 @@ entry:
 ; FUNC-LABEL: @f64_ult
 ; SI: V_CMP_U_F64
 ; SI: V_CMP_LT_F64
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
 define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ult double %a, %b
@@ -129,7 +139,9 @@ entry:
 ; FUNC-LABEL: @f64_ule
 ; SI: V_CMP_U_F64
 ; SI: V_CMP_LE_F64
-; SI: S_OR_B64
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: V_OR_B32_e32
 define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ule double %a, %b
-- 
1.8.1.5