[PATCH] R600/SI: Try to use scalar BFE.
Tom Stellard
tom at stellard.net
Wed Apr 2 10:04:15 PDT 2014
On Tue, Apr 01, 2014 at 11:33:07AM -0700, Matt Arsenault wrote:
> Use scalar BFE with a constant offset and width when possible.
> This is complicated by the fact that the scalar version packs
> the two operands of the vector version into one.
>
LGTM.
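
One note for readers following along: the packed second source operand the
patch describes puts the bit offset in bits [5:0] and the field width in bits
[22:16], so the immediates that show up in the updated sext-in-reg.ll checks
are just (width << 16) | offset. A minimal standalone sketch of that encoding
(packBFEOperand is an illustrative name, not something from the patch):

  #include <cassert>
  #include <cstdint>

  // Pack a BFE bit offset and width into the single S_BFE_* source
  // operand: the offset goes in bits [5:0], the width in bits [22:16].
  static uint32_t packBFEOperand(uint32_t Offset, uint32_t Width) {
    assert(Offset <= 0x3f && Width <= 0x7f && "field does not fit");
    return Offset | (Width << 16);
  }

  // The constants in the new CHECK lines decode as expected:
  //   packBFEOperand(0, 1) == 65536   (1-bit field at offset 0)
  //   packBFEOperand(0, 8) == 524288  (8-bit field at offset 0)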
>
> http://llvm-reviews.chandlerc.com/D3249
>
> Files:
> lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> lib/Target/R600/SIInstrInfo.cpp
> test/CodeGen/R600/sext-in-reg.ll
> Index: lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> +++ lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> @@ -198,13 +198,14 @@
> N->setNodeId(-1);
> return NULL; // Already selected.
> }
> +
> + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
> switch (Opc) {
> default: break;
> // We are selecting i64 ADD here instead of custom lower it during
> // DAG legalization, so we can fold some i64 ADDs used for address
> // calculation into the LOAD and STORE instructions.
> case ISD::ADD: {
> - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
> if (N->getValueType(0) != MVT::i64 ||
> ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
> break;
> @@ -250,7 +251,6 @@
> }
> case ISD::BUILD_VECTOR: {
> unsigned RegClassID;
> - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
> const AMDGPURegisterInfo *TRI =
> static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
> const SIRegisterInfo *SIRI =
> @@ -337,7 +337,6 @@
> }
> case ISD::BUILD_PAIR: {
> SDValue RC, SubReg0, SubReg1;
> - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
> if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
> break;
> }
> @@ -358,7 +357,6 @@
> SDLoc(N), N->getValueType(0), Ops);
> }
> case AMDGPUISD::REGISTER_LOAD: {
> - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
> if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
> break;
> SDValue Addr, Offset;
> @@ -375,7 +373,6 @@
> Ops);
> }
> case AMDGPUISD::REGISTER_STORE: {
> - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
> if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
> break;
> SDValue Addr, Offset;
> @@ -391,6 +388,47 @@
> CurDAG->getVTList(MVT::Other),
> Ops);
> }
> +
> + case AMDGPUISD::BFE_I32:
> + case AMDGPUISD::BFE_U32: {
> + if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
> + break;
> +
> + // There is a scalar version available, but unlike the vector version, which
> + // has separate operands for the offset and width, the scalar version packs
> + // the width and offset into a single operand. Try to move to the scalar
> + // version if the offset and width are constant, so that we can keep
> + // extended loads of kernel arguments in SGPRs.
> +
> + // TODO: Technically we could try to pattern match scalar bitshifts of
> + // dynamic values, but it's probably not useful.
> + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
> + if (!Offset)
> + break;
> +
> + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
> + if (!Width)
> + break;
> +
> + bool Signed = Opc == AMDGPUISD::BFE_I32;
> +
> + // Pack the offset and width of the BFE into the format expected by
> + // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0]
> + // contain the offset and bits [22:16] the width.
> +
> + uint32_t OffsetVal = Offset->getZExtValue();
> + uint32_t WidthVal = Width->getZExtValue();
> +
> + uint32_t PackedVal = OffsetVal | WidthVal << 16;
> +
> + SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32);
> + return CurDAG->getMachineNode(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
> + SDLoc(N),
> + MVT::i32,
> + N->getOperand(0),
> + PackedOffsetWidth);
> +
> + }
> }
> return SelectCode(N);
> }
> Index: lib/Target/R600/SIInstrInfo.cpp
> ===================================================================
> --- lib/Target/R600/SIInstrInfo.cpp
> +++ lib/Target/R600/SIInstrInfo.cpp
> @@ -538,6 +538,8 @@
> case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
> case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
> case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
> + case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
> + case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
> }
> }
>
> @@ -989,6 +991,22 @@
>
> addDescImplicitUseDef(NewDesc, Inst);
>
> + if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
> + const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
> + // If this instruction has to be moved to VGPRs, unpack the packed second
> + // operand back into the two separate bit offset and width operands.
> + assert(OffsetWidthOp.isImm() &&
> + "Scalar BFE is only implemented for constant width and offset");
> + uint32_t Imm = OffsetWidthOp.getImm();
> +
> + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
> + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
> +
> + Inst->RemoveOperand(2); // Remove old immediate.
> + Inst->addOperand(MachineOperand::CreateImm(Offset));
> + Inst->addOperand(MachineOperand::CreateImm(BitWidth));
> + }
> +
> legalizeOperands(Inst);
>
> // Update the destination register class.
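
The unpacking done here when the instruction has to be moved to VGPRs is the
exact inverse of the selection-time packing, which keeps the two paths easy to
cross-check. Roughly, under the same bit layout as above (again an illustrative
helper, not code from the patch):

  #include <cstdint>

  // Split a packed S_BFE_* source operand back into the separate offset
  // and width operands that V_BFE_I32 / V_BFE_U32 take.
  static void unpackBFEOperand(uint32_t Packed, uint32_t &Offset,
                               uint32_t &Width) {
    Offset = Packed & 0x3f;         // bits [5:0]
    Width = (Packed >> 16) & 0x7f;  // bits [22:16]
  }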
> Index: test/CodeGen/R600/sext-in-reg.ll
> ===================================================================
> --- test/CodeGen/R600/sext-in-reg.ll
> +++ test/CodeGen/R600/sext-in-reg.ll
> @@ -6,7 +6,8 @@
>
> ; FUNC-LABEL: @sext_in_reg_i1_i32
> ; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
> -; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[ARG]], 0, 1
> +; SI: S_BFE_I32 [[SEXTRACT:s[0-9]+]], [[ARG]], 65536
> +; SI: V_MOV_B32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
> ; SI: BUFFER_STORE_DWORD [[EXTRACT]],
>
> ; EG: BFE_INT
> @@ -110,8 +111,8 @@
>
> ; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
> ; XFUNC-LABEL: @sext_in_reg_i8_to_v1i64
> -; XSI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
> -; XSI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
> +; XSI: S_BFE_I32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
> +; XSI: S_ASHR_I32 {{s[0-9]+}}, [[EXTRACT]], 31
> ; XSI: BUFFER_STORE_DWORD
> ; XEG: BFE_INT
> ; XEG: ASHR
> @@ -152,8 +153,8 @@
>
>
> ; FUNC-LABEL: @sext_in_reg_v2i1_to_v2i32
> -; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
> -; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
> +; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 65536
> +; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 65536
> ; SI: BUFFER_STORE_DWORDX2
> ; EG: BFE
> ; EG: BFE
> @@ -166,10 +167,10 @@
> }
>
> ; FUNC-LABEL: @sext_in_reg_v4i1_to_v4i32
> -; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
> -; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
> -; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
> -; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
> +; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 65536
> +; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 65536
> +; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 65536
> +; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 65536
> ; SI: BUFFER_STORE_DWORDX4
>
> ; EG: BFE
> @@ -257,6 +258,34 @@
> ret void
> }
>
> +; FUNC-LABEL: @vgpr_sext_in_reg_v4i8_to_v4i32
> +; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
> +; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
> +; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
> +; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
> +define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
> + %loada = load <4 x i32> addrspace(1)* %a, align 16
> + %loadb = load <4 x i32> addrspace(1)* %b, align 16
> + %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
> + %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
> + %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
> + store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FUNC-LABEL: @vgpr_sext_in_reg_v4i16_to_v4i32
> +; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
> +; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
> +define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
> + %loada = load <4 x i32> addrspace(1)* %a, align 16
> + %loadb = load <4 x i32> addrspace(1)* %b, align 16
> + %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
> + %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
> + %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
> + store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
> + ret void
> +}
> +
> ; FIXME: The BFE should really be eliminated. I think it should happen
> ; when computeMaskedBitsForTargetNode is implemented for imax.
>
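On the test changes more generally: the shl/ashr pairs in the new vgpr_* tests
are sign_extend_inreg patterns, which is why they map directly onto BFE. In
case it helps anyone reasoning about the checks, here is a reference model of
the signed extract (a sketch only, not code from the patch, and it assumes
1 <= Width and Offset + Width <= 32):

  #include <cstdint>

  // Reference model of a 32-bit signed bitfield extract.
  static int32_t bfeI32Ref(uint32_t Src, uint32_t Offset, uint32_t Width) {
    uint32_t Field = Src >> Offset;
    if (Width < 32)
      Field &= (1u << Width) - 1;
    uint32_t SignBit = 1u << (Width - 1);
    return (int32_t)((Field ^ SignBit) - SignBit); // sign-extend Width bits
  }

  // e.g. bfeI32Ref(x, 0, 8) is the per-lane result of the shl/ashr-by-24
  // pairs in the vgpr_sext_in_reg_v4i8_to_v4i32 test above.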
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits