[llvm] r315610 - Implement custom lowering for ISD::CTTZ_ZERO_UNDEF and ISD::CTTZ.
Wei Ding via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 12 12:37:14 PDT 2017
Author: wdng
Date: Thu Oct 12 12:37:14 2017
New Revision: 315610
URL: http://llvm.org/viewvc/llvm-project?rev=315610&view=rev
Log:
Implement custom lowering for ISD::CTTZ_ZERO_UNDEF and ISD::CTTZ.
Differential Revision: http://reviews.llvm.org/D37348
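For context: ISD::CTTZ is required to produce the operand's bit width for a zero input, while ISD::CTTZ_ZERO_UNDEF leaves that case undefined, which is what lets it map directly onto the AMDGPU ffbl/ff1 instructions (those return -1 for a zero input). The patch bridges the two with explicit selects, mirroring the existing CTLZ handling. A minimal C++ model of the required semantics, purely illustrative and not part of the patch (the function name and the use of __builtin_ctz are stand-ins):

#include <cstdint>

// ISD::CTTZ semantics: count trailing zero bits, with the bit width
// (here 32) as the defined result for a zero input.
uint32_t cttz32(uint32_t X) {
  if (X == 0)
    return 32;              // ISD::CTTZ_ZERO_UNDEF would leave this undefined
  return __builtin_ctz(X);  // stand-in for the trailing-zero count
}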
Modified:
llvm/trunk/include/llvm/Target/TargetSelectionDAG.td
llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td
llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td
llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll
Modified: llvm/trunk/include/llvm/Target/TargetSelectionDAG.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetSelectionDAG.td?rev=315610&r1=315609&r2=315610&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetSelectionDAG.td (original)
+++ llvm/trunk/include/llvm/Target/TargetSelectionDAG.td Thu Oct 12 12:37:14 2017
@@ -132,7 +132,7 @@ def SDTFPSignOp : SDTypeProfile<1, 2, [
def SDTFPTernaryOp : SDTypeProfile<1, 3, [ // fmadd, fnmsub, etc.
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0>
]>;
-def SDTIntUnaryOp : SDTypeProfile<1, 1, [ // ctlz
+def SDTIntUnaryOp : SDTypeProfile<1, 1, [ // ctlz, cttz
SDTCisSameAs<0, 1>, SDTCisInt<0>
]>;
def SDTIntExtendOp : SDTypeProfile<1, 1, [ // sext, zext, anyext
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp?rev=315610&r1=315609&r2=315610&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp Thu Oct 12 12:37:14 2017
@@ -2773,7 +2773,7 @@ SDValue SelectionDAGLegalize::ExpandBitC
return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op);
case ISD::CTLZ: {
EVT VT = Op.getValueType();
- unsigned len = VT.getSizeInBits();
+ unsigned Len = VT.getSizeInBits();
if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
EVT SetCCVT = getSetCCResultType(VT);
@@ -2781,7 +2781,7 @@ SDValue SelectionDAGLegalize::ExpandBitC
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
- DAG.getConstant(len, dl, VT), CTLZ);
+ DAG.getConstant(Len, dl, VT), CTLZ);
}
// for now, we do this:
@@ -2794,7 +2794,7 @@ SDValue SelectionDAGLegalize::ExpandBitC
//
// Ref: "Hacker's Delight" by Henry Warren
EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
- for (unsigned i = 0; (1U << i) <= (len / 2); ++i) {
+ for (unsigned i = 0; (1U << i) <= (Len / 2); ++i) {
SDValue Tmp3 = DAG.getConstant(1ULL << i, dl, ShVT);
Op = DAG.getNode(ISD::OR, dl, VT, Op,
DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3));
@@ -2806,11 +2806,22 @@ SDValue SelectionDAGLegalize::ExpandBitC
// This trivially expands to CTTZ.
return DAG.getNode(ISD::CTTZ, dl, Op.getValueType(), Op);
case ISD::CTTZ: {
+ EVT VT = Op.getValueType();
+ unsigned Len = VT.getSizeInBits();
+
+ if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
+ EVT SetCCVT = getSetCCResultType(VT);
+ SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
+ return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
+ DAG.getConstant(Len, dl, VT), CTTZ);
+ }
+
// for now, we use: { return popcount(~x & (x - 1)); }
// unless the target has ctlz but not ctpop, in which case we use:
// { return 32 - nlz(~x & (x-1)); }
// Ref: "Hacker's Delight" by Henry Warren
- EVT VT = Op.getValueType();
SDValue Tmp3 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNOT(dl, Op, VT),
DAG.getNode(ISD::SUB, dl, VT, Op,
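The new ISD::CTTZ case above follows the same shape as the existing ISD::CTLZ case: when the _ZERO_UNDEF flavor is legal or custom for the type, emit it and fix up a zero input with a setcc/select rather than falling through to the popcount expansion. A C++ sketch of the value the expansion computes, illustrative only (expandCTTZ and __builtin_ctz stand in for the DAG nodes):

#include <cstdint>

// Sketch for VT = i32, so Len = VT.getSizeInBits() = 32.
uint32_t expandCTTZ(uint32_t Op) {
  const uint32_t Len = 32;
  // SELECT(SETCC(Op, 0, SETEQ), Len, CTTZ_ZERO_UNDEF(Op))
  return Op == 0 ? Len : __builtin_ctz(Op);
}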
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=315610&r1=315609&r2=315610&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Thu Oct 12 12:37:14 2017
@@ -417,8 +417,10 @@ AMDGPUTargetLowering::AMDGPUTargetLoweri
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
if (Subtarget->hasFFBL())
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+ setOperationAction(ISD::CTTZ, MVT::i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
@@ -1113,9 +1115,11 @@ SDValue AMDGPUTargetLowering::LowerOpera
case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
- return LowerCTLZ(Op, DAG);
+ return LowerCTLZ_CTTZ(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
}
return Op;
@@ -2154,13 +2158,33 @@ SDValue AMDGPUTargetLowering::LowerFFLOO
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
-SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
+static bool isCtlzOpc(unsigned Opc) {
+ return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
+}
+
+static bool isCttzOpc(unsigned Opc) {
+ return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
+}
+
+SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
- bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
+ bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
+ Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
+
+ unsigned ISDOpc, NewOpc;
+ if (isCtlzOpc(Op.getOpcode())) {
+ ISDOpc = ISD::CTLZ_ZERO_UNDEF;
+ NewOpc = AMDGPUISD::FFBH_U32;
+ } else if (isCttzOpc(Op.getOpcode())) {
+ ISDOpc = ISD::CTTZ_ZERO_UNDEF;
+ NewOpc = AMDGPUISD::FFBL_B32;
+ } else
+ llvm_unreachable("Unexpected OPCode!!!");
+
if (ZeroUndef && Src.getValueType() == MVT::i32)
- return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
+ return DAG.getNode(NewOpc, SL, MVT::i32, Src);
SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
@@ -2173,24 +2197,33 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(
EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), MVT::i32);
- SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
+ SDValue ZeroOrOne = isCtlzOpc(Op.getOpcode()) ? Zero : One;
+ SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
+ SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, ZeroOrOne, ISD::SETEQ);
- SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
- SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
+ SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
+ SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
- SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
-
- // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
- SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
+ SDValue Add, NewOpr;
+ if (isCtlzOpc(Op.getOpcode())) {
+ Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
+ // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
+ NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
+ } else {
+ Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
+ // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
+ NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
+ }
if (!ZeroUndef) {
// Test if the full 64-bit input is zero.
// FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
// which we probably don't want.
- SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
- SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
+ SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
+ SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, ZeroOrOne, ISD::SETEQ);
+ SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
// TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
// with the same cycles, otherwise it is slower.
@@ -2201,11 +2234,11 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(
// The instruction returns -1 for 0 input, but the defined intrinsic
// behavior is to return the number of bits.
- NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
- SrcIsZero, Bits32, NewCtlz);
+ NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
+ SrcIsZero, Bits32, NewOpr);
}
- return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
}
SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
@@ -3117,13 +3150,10 @@ static bool isNegativeOne(SDValue Val) {
return false;
}
-static bool isCtlzOpc(unsigned Opc) {
- return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
-}
-
-SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
+SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
SDValue Op,
- const SDLoc &DL) const {
+ const SDLoc &DL,
+ unsigned Opc) const {
EVT VT = Op.getValueType();
EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
@@ -3133,11 +3163,11 @@ SDValue AMDGPUTargetLowering::getFFBH_U3
if (VT != MVT::i32)
Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
- SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op);
+ SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
if (VT != MVT::i32)
- FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH);
+ FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
- return FFBH;
+ return FFBX;
}
// The native instructions return -1 on 0 input. Optimize out a select that
@@ -3147,7 +3177,7 @@ SDValue AMDGPUTargetLowering::getFFBH_U3
// against the bitwidth.
//
// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
-SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
+SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
SDValue LHS, SDValue RHS,
DAGCombinerInfo &DCI) const {
ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
@@ -3158,20 +3188,25 @@ SDValue AMDGPUTargetLowering::performCtl
ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue CmpLHS = Cond.getOperand(0);
+ unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
+ AMDGPUISD::FFBH_U32;
+
// select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
+ // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
if (CCOpcode == ISD::SETEQ &&
- isCtlzOpc(RHS.getOpcode()) &&
+ (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
RHS.getOperand(0) == CmpLHS &&
isNegativeOne(LHS)) {
- return getFFBH_U32(DAG, CmpLHS, SL);
+ return getFFBX_U32(DAG, CmpLHS, SL, Opc);
}
// select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
+ // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
if (CCOpcode == ISD::SETNE &&
- isCtlzOpc(LHS.getOpcode()) &&
+ (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
LHS.getOperand(0) == CmpLHS &&
isNegativeOne(RHS)) {
- return getFFBH_U32(DAG, CmpLHS, SL);
+ return getFFBX_U32(DAG, CmpLHS, SL, Opc);
}
return SDValue();
@@ -3304,7 +3339,7 @@ SDValue AMDGPUTargetLowering::performSel
}
// There's no reason to not do this if the condition has other uses.
- return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
+ return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
}
static bool isConstantFPZero(SDValue N) {
@@ -3892,6 +3927,7 @@ const char* AMDGPUTargetLowering::getTar
NODE_NAME_CASE(BFM)
NODE_NAME_CASE(FFBH_U32)
NODE_NAME_CASE(FFBH_I32)
+ NODE_NAME_CASE(FFBL_B32)
NODE_NAME_CASE(MUL_U24)
NODE_NAME_CASE(MUL_I24)
NODE_NAME_CASE(MULHI_U24)
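For 64-bit operands, LowerCTLZ_CTTZ splits the source into 32-bit halves and chooses which half to scan based on the opcode, per the in-line comments: ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)), and cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x)). A C++ sketch of the cttz half of that split, covering only the zero-undef path so a nonzero input is assumed (the function name and __builtin_ctz are stand-ins for the DAG nodes):

#include <cstdint>

// cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
// Assumes X != 0; the !ZeroUndef path additionally tests the whole
// 64-bit input for zero (see the SrcIsZero select above).
uint32_t cttz64ZeroUndef(uint64_t X) {
  uint32_t Lo = static_cast<uint32_t>(X);        // lo_32(x)
  uint32_t Hi = static_cast<uint32_t>(X >> 32);  // hi_32(x)
  return Lo == 0 ? __builtin_ctz(Hi) + 32 : __builtin_ctz(Lo);
}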
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=315610&r1=315609&r2=315610&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Thu Oct 12 12:37:14 2017
@@ -32,7 +32,7 @@ private:
/// legalized from a smaller type VT. Need to match pre-legalized type because
/// the generic legalization inserts the add/sub between the select and
/// compare.
- SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const;
+ SDValue getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, unsigned Opc) const;
public:
static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op);
@@ -57,7 +57,7 @@ protected:
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
@@ -88,7 +88,7 @@ protected:
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
+ SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -371,6 +371,7 @@ enum NodeType : unsigned {
BFM, // Insert a range of bits into a 32-bit word.
FFBH_U32, // ctlz with -1 if input is zero.
FFBH_I32,
+ FFBL_B32, // cttz with -1 if input is zero.
MUL_U24,
MUL_I24,
MULHI_U24,
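FFBL_B32 is documented above as "cttz with -1 if input is zero", the same convention the comment in LowerCTLZ_CTTZ notes for the hardware instruction, and performCtlz_CttzCombine relies on it to fold the select patterns listed in its comments straight into the node. A small C++ model of that value (illustrative; ffblB32 is a made-up name):

#include <cstdint>

// Value convention of AMDGPUISD::FFBL_B32 as documented: trailing-zero
// count, or -1 when the input is zero.
int32_t ffblB32(uint32_t X) {
  return X == 0 ? -1 : static_cast<int32_t>(__builtin_ctz(X));
}

This is also the value of select (setcc x, 0, eq), -1, (cttz_zero_undef x), which is why the combine can drop the compare and select entirely.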
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td?rev=315610&r1=315609&r2=315610&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td Thu Oct 12 12:37:14 2017
@@ -298,6 +298,8 @@ def AMDGPUbfm : SDNode<"AMDGPUISD::BFM",
def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>;
def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>;
+def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>;
+
// Signed and unsigned 24-bit multiply. The highest 8-bits are ignore
// when performing the mulitply. The result is a 32-bit value.
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
Modified: llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td?rev=315610&r1=315609&r2=315610&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td Thu Oct 12 12:37:14 2017
@@ -449,7 +449,7 @@ def FLT32_TO_FLT16 : R600_1OP_Helper <0x
def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>;
def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>;
-def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>;
+def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", AMDGPUffbl_b32, VecALU>;
let hasSideEffects = 1 in {
def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
Modified: llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td?rev=315610&r1=315609&r2=315610&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td Thu Oct 12 12:37:14 2017
@@ -159,10 +159,11 @@ def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcn
def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
+def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">;
+
def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32",
- [(set i32:$sdst, (cttz_zero_undef i32:$src0))]
+ [(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))]
>;
-def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">;
def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32",
[(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
Modified: llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll?rev=315610&r1=315609&r2=315610&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll Thu Oct 12 12:37:14 2017
@@ -1,8 +1,12 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-NOSDWA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-SDWA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
+declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
+declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
+declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
+declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -76,3 +80,187 @@ define amdgpu_kernel void @v_cttz_zero_u
store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
ret void
}
+
+; FUNC-LABEL: {{^}}s_cttz_zero_undef_i8_with_select:
+; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
+; EG: MEM_RAT MSKOR
+; EG: FFBL_INT
+define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 %val) nounwind {
+ %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
+ %cttz_ret = icmp ne i8 %val, 0
+ %ret = select i1 %cttz_ret, i8 %cttz, i8 32
+ store i8 %cttz, i8 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_cttz_zero_undef_i16_with_select:
+; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
+; EG: MEM_RAT MSKOR
+; EG: FFBL_INT
+define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 %val) nounwind {
+ %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
+ %cttz_ret = icmp ne i16 %val, 0
+ %ret = select i1 %cttz_ret, i16 %cttz, i16 32
+ store i16 %cttz, i16 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32_with_select:
+; SI: s_ff1_i32_b32
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+ %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
+ %cttz_ret = icmp ne i32 %val, 0
+ %ret = select i1 %cttz_ret, i32 %cttz, i32 32
+ store i32 %cttz, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_cttz_zero_undef_i64_with_select:
+; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
+; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+ %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
+ %cttz_ret = icmp ne i64 %val, 0
+ %ret = select i1 %cttz_ret, i64 %cttz, i64 32
+ store i64 %cttz, i64 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_i8_with_select:
+; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI-SDWA: v_ffbl_b32_sdwa
+; EG: MEM_RAT MSKOR
+define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
+ %val = load i8, i8 addrspace(1)* %arrayidx, align 1
+ %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
+ %cttz_ret = icmp ne i8 %val, 0
+ %ret = select i1 %cttz_ret, i8 %cttz, i8 32
+ store i8 %ret, i8 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_i16_with_select:
+; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI-SDWA: v_ffbl_b32_sdwa
+; EG: MEM_RAT MSKOR
+define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
+ %val = load i16, i16 addrspace(1)* %arrayidx, align 1
+ %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
+ %cttz_ret = icmp ne i16 %val, 0
+ %ret = select i1 %cttz_ret, i16 %cttz, i16 32
+ store i16 %ret, i16 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32_with_select:
+; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
+ %val = load i32, i32 addrspace(1)* %arrayidx, align 1
+ %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
+ %cttz_ret = icmp ne i32 %val, 0
+ %ret = select i1 %cttz_ret, i32 %cttz, i32 32
+ store i32 %ret, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_i64_with_select:
+; SI-NOSDWA: v_or_b32_e32
+; SI-NOSDWA: v_or_b32_e32
+; SI-NOSDWA: v_or_b32_e32
+; SI-SDWA: v_or_b32_sdwa
+; SI-NOSDWA: v_or_b32_e32
+; SI-SDWA: v_or_b32_sdwa
+; SI: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
+; SI-DAG: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 addrspace(1)* nocapture readonly %arrayidx) nounwind {
+ %val = load i64, i64 addrspace(1)* %arrayidx, align 1
+ %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
+ %cttz_ret = icmp ne i64 %val, 0
+ %ret = select i1 %cttz_ret, i64 %cttz, i64 32
+ store i64 %ret, i64 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_cttz_i32_sel_eq_neg1:
+; SI: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL:v[0-9]+]]
+; SI: v_cmp_ne_u32_e32 vcc, 0, [[VAL]]
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW
+; EG: FFBL_INT
+define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
+ %val = load i32, i32 addrspace(1)* %arrayidx, align 1
+ %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
+ %cmp = icmp eq i32 %val, 0
+ %sel = select i1 %cmp, i32 -1, i32 %ctlz
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_cttz_i32_sel_ne_neg1:
+; SI: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL:v[0-9]+]]
+; SI: v_cmp_ne_u32_e32 vcc, 0, [[VAL]]
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW
+; EG: FFBL_INT
+define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
+ %val = load i32, i32 addrspace(1)* %arrayidx, align 1
+ %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
+ %cmp = icmp ne i32 %val, 0
+ %sel = select i1 %cmp, i32 %ctlz, i32 -1
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_cttz_i32_sel_ne_bitwidth:
+; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_cmp
+; SI: v_cndmask
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW
+; EG: FFBL_INT
+define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
+ %val = load i32, i32 addrspace(1)* %arrayidx, align 1
+ %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
+ %cmp = icmp ne i32 %ctlz, 32
+ %sel = select i1 %cmp, i32 %ctlz, i32 -1
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_cttz_i8_sel_eq_neg1:
+; SI: {{buffer|flat}}_load_ubyte
+; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; EG: MEM_RAT MSKOR
+; EG: FFBL_INT
+ define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
+ %val = load i8, i8 addrspace(1)* %arrayidx, align 1
+ %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
+ %cmp = icmp eq i8 %val, 0
+ %sel = select i1 %cmp, i8 -1, i8 %ctlz
+ store i8 %sel, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_cttz_i16_sel_eq_neg1:
+; SI: {{buffer|flat}}_load_ubyte
+; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: buffer_store_short
+; EG: MEM_RAT MSKOR
+; EG: FFBL_INT
+ define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
+ %val = load i16, i16 addrspace(1)* %arrayidx, align 1
+ %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
+ %cmp = icmp eq i16 %val, 0
+ %sel = select i1 %cmp, i16 -1, i16 %ctlz
+ store i16 %sel, i16 addrspace(1)* %out
+ ret void
+}
+
+