[llvm] r206558 - R600/SI: Try to use scalar BFE.

Matt Arsenault <Matthew.Arsenault at amd.com>
Thu Apr 17 22:19:26 PDT 2014


Author: arsenm
Date: Fri Apr 18 00:19:26 2014
New Revision: 206558

URL: http://llvm.org/viewvc/llvm-project?rev=206558&view=rev
Log:
R600/SI: Try to use scalar BFE.

Use the scalar BFE when the offset and width are constant.
This is complicated by the fact that the scalar version packs
the two operands of the vector version into one.
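
For illustration, here is a minimal sketch of the packed encoding the
patch uses (the bit positions are taken from the comments in the diff
below; the helper name is hypothetical, not part of the patch):

    // Offset goes in bits [5:0] of the second source operand,
    // width in bits [22:16].
    static uint32_t packBFEOffsetWidth(uint32_t Offset, uint32_t Width) {
      return Offset | (Width << 16);
    }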

Modified:
    llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
    llvm/trunk/lib/Target/R600/SIInstrInfo.cpp
    llvm/trunk/test/CodeGen/R600/sext-in-reg.ll

Modified: llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp?rev=206558&r1=206557&r2=206558&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp Fri Apr 18 00:19:26 2014
@@ -203,13 +203,14 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNod
     N->setNodeId(-1);
     return NULL;   // Already selected.
   }
+
+  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
   switch (Opc) {
   default: break;
   // We are selecting i64 ADD here instead of custom lower it during
   // DAG legalization, so we can fold some i64 ADDs used for address
   // calculation into the LOAD and STORE instructions.
   case ISD::ADD: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     if (N->getValueType(0) != MVT::i64 ||
         ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
       break;
@@ -255,7 +256,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNod
   }
   case ISD::BUILD_VECTOR: {
     unsigned RegClassID;
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     const AMDGPURegisterInfo *TRI =
                    static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
     const SIRegisterInfo *SIRI =
@@ -342,7 +342,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNod
   }
   case ISD::BUILD_PAIR: {
     SDValue RC, SubReg0, SubReg1;
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
       break;
     }
@@ -393,7 +392,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNod
   }
 
   case AMDGPUISD::REGISTER_LOAD: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
     SDValue Addr, Offset;
@@ -410,7 +408,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNod
                                   Ops);
   }
   case AMDGPUISD::REGISTER_STORE: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
     SDValue Addr, Offset;
@@ -426,6 +423,47 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNod
                                         CurDAG->getVTList(MVT::Other),
                                         Ops);
   }
+
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      break;
+
+    // There is a scalar version available, but unlike the vector version which
+    // has a separate operand for the offset and width, the scalar version packs
+    // the width and offset into a single operand. Try to move to the scalar
+    // version if the offsets are constant, so that we can try to keep extended
+    // loads of kernel arguments in SGPRs.
+
+    // TODO: Technically we could try to pattern match scalar bitshifts of
+    // dynamic values, but it's probably not useful.
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!Offset)
+      break;
+
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+    if (!Width)
+      break;
+
+    bool Signed = Opc == AMDGPUISD::BFE_I32;
+
+    // Pack the offset and width of a BFE into the format expected by
+    // S_BFE_I32 / S_BFE_U32. In the second source operand, bits [5:0]
+    // contain the offset and bits [22:16] the width.
+
+    uint32_t OffsetVal = Offset->getZExtValue();
+    uint32_t WidthVal = Width->getZExtValue();
+
+    uint32_t PackedVal = OffsetVal | WidthVal << 16;
+
+    SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32);
+    return CurDAG->getMachineNode(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
+                                  SDLoc(N),
+                                  MVT::i32,
+                                  N->getOperand(0),
+                                  PackedOffsetWidth);
+
+  }
   }
   return SelectCode(N);
 }
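
The packed immediates that appear in the updated test checks fall out
of this encoding directly; two worked values (computed by hand from
Offset | (Width << 16), using the hypothetical helper above):

    packBFEOffsetWidth(0, 1) == 1 << 16 == 0x10000            // i1 sext-in-reg
    packBFEOffsetWidth(0, 8) == 8 << 16 == 0x80000 == 524288  // i8 sext-in-reg

These are the 0x10000 and 524288 constants matched in the sext-in-reg.ll
checks below.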

Modified: llvm/trunk/lib/Target/R600/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIInstrInfo.cpp?rev=206558&r1=206557&r2=206558&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/R600/SIInstrInfo.cpp Fri Apr 18 00:19:26 2014
@@ -539,6 +539,8 @@ unsigned SIInstrInfo::getVALUOp(const Ma
   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
+  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
@@ -1009,6 +1011,27 @@ void SIInstrInfo::moveToVALU(MachineInst
 
     addDescImplicitUseDef(NewDesc, Inst);
 
+    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+      const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
+      // If we need to move this to VGPRs, we need to unpack the second operand
+      // back into the 2 separate ones for bit offset and width.
+      assert(OffsetWidthOp.isImm() &&
+             "Scalar BFE is only implemented for constant width and offset");
+      uint32_t Imm = OffsetWidthOp.getImm();
+
+      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+      Inst->RemoveOperand(2); // Remove old immediate.
+      Inst->addOperand(MachineOperand::CreateImm(Offset));
+      Inst->addOperand(MachineOperand::CreateImm(BitWidth));
+
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+    }
+
     // Update the destination register class.
 
     const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
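
When moveToVALU rewrites S_BFE_* into V_BFE_*, it inverts the packing
done at selection time. A minimal sketch of just the unpack step, using
the same masks as the hunk above (helper name hypothetical):

    static void unpackBFEOffsetWidth(uint32_t Imm,
                                     uint32_t &Offset, uint32_t &Width) {
      Offset = Imm & 0x3f;             // Bits [5:0].
      Width = (Imm & 0x7f0000) >> 16;  // Bits [22:16].
    }

The four trailing zero immediates added after the unpacked offset and
width fill out the remaining operands of the VOP3 encoding of V_BFE
(presumably the modifier operands, which are unused here).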

Modified: llvm/trunk/test/CodeGen/R600/sext-in-reg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/R600/sext-in-reg.ll?rev=206558&r1=206557&r2=206558&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/R600/sext-in-reg.ll (original)
+++ llvm/trunk/test/CodeGen/R600/sext-in-reg.ll Fri Apr 18 00:19:26 2014
@@ -1,12 +1,13 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
 
 
 ; FUNC-LABEL: @sext_in_reg_i1_i32
 ; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[ARG]], 0, 1
+; SI: S_BFE_I32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
+; SI: V_MOV_B32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
 ; SI: BUFFER_STORE_DWORD [[EXTRACT]],
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
@@ -148,8 +149,8 @@ define void @sext_in_reg_i32_to_i64(i64
 
 ; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
 ; XFUNC-LABEL: @sext_in_reg_i8_to_v1i64
-; XSI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
-; XSI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
+; XSI: S_BFE_I32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
+; XSI: S_ASHR_I32 {{v[0-9]+}}, [[EXTRACT]], 31
 ; XSI: BUFFER_STORE_DWORD
 ; XEG: BFE_INT
 ; XEG: ASHR
@@ -204,8 +205,8 @@ define void @sext_in_reg_v2i1_in_v2i32_o
 
 
 ; FUNC-LABEL: @sext_in_reg_v2i1_to_v2i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
 ; SI: BUFFER_STORE_DWORDX2
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
@@ -221,10 +222,10 @@ define void @sext_in_reg_v2i1_to_v2i32(<
 }
 
 ; FUNC-LABEL: @sext_in_reg_v4i1_to_v4i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
 ; SI: BUFFER_STORE_DWORDX4
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
@@ -320,6 +321,34 @@ define void @testcase_3(i8 addrspace(1)*
   ret void
 }
 
+; FUNC-LABEL: @vgpr_sext_in_reg_v4i8_to_v4i32
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+  %loada = load <4 x i32> addrspace(1)* %a, align 16
+  %loadb = load <4 x i32> addrspace(1)* %b, align 16
+  %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+  %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+  %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @vgpr_sext_in_reg_v4i16_to_v4i32
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+  %loada = load <4 x i32> addrspace(1)* %a, align 16
+  %loadb = load <4 x i32> addrspace(1)* %b, align 16
+  %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+  %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
+  %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
+  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
 ; FIXME: The BFE should really be eliminated. I think it should happen
 ; when computeMaskedBitsForTargetNode is implemented for imax.
 
