[PATCH] R600: Match 24-bit arithmetic patterns in a Target DAGCombine

Tom Stellard tom at stellard.net
Sat Mar 29 08:23:08 PDT 2014


On Fri, Mar 28, 2014 at 02:15:39PM -0700, Matt Arsenault wrote:
> On 03/28/2014 09:34 PM, Tom Stellard wrote:
> > Moving these patterns from TableGen files to PerformDAGCombine()
> > should allow us to generate better code by eliminating unnecessary
> > shifts and extensions earlier.
> >
> > This also fixes a bug where the MAD pattern was calling
> > SimplifyDemandedBits with a 24-bit mask on the first operand
> > even when the full pattern wasn't being matched.  This occasionally
> > resulted in some instructions being incorrectly deleted from the
> > program.
> > ---
> >   lib/Target/R600/AMDGPUISelDAGToDAG.cpp   | 46 ------------------
> >   lib/Target/R600/AMDGPUISelLowering.cpp   | 83 ++++++++++++++++++++++++++++++++
> >   lib/Target/R600/AMDGPUISelLowering.h     |  4 ++
> >   lib/Target/R600/AMDGPUInstrInfo.td       |  8 +++
> >   lib/Target/R600/AMDGPUInstructions.td    |  3 --
> >   lib/Target/R600/AMDGPUSubtarget.h        |  9 ++++
> >   lib/Target/R600/CaymanInstructions.td    |  4 +-
> >   lib/Target/R600/EvergreenInstructions.td |  4 +-
> >   lib/Target/R600/R600ISelLowering.cpp     |  1 +
> >   lib/Target/R600/SIISelLowering.cpp       |  2 +-
> >   lib/Target/R600/SIInstructions.td        |  8 +--
> >   test/CodeGen/R600/mad_int24.ll           | 17 ++++---
> >   test/CodeGen/R600/mad_uint24.ll          | 67 +++++++++++++++-----------
> >   test/CodeGen/R600/mul_int24.ll           | 17 ++++---
> >   test/CodeGen/R600/mul_uint24.ll          | 43 +++++++----------
> >   15 files changed, 190 insertions(+), 126 deletions(-)
> >
> > diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> > index e8c5f5b..5fd268d 100644
> > --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> > +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> > @@ -58,9 +58,6 @@ private:
> >     bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
> >     bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
> >     bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
> > -  SDValue SimplifyI24(SDValue &Op);
> > -  bool SelectI24(SDValue Addr, SDValue &Op);
> > -  bool SelectU24(SDValue Addr, SDValue &Op);
> >   
> >     static bool checkType(const Value *ptr, unsigned int addrspace);
> >   
> > @@ -563,49 +560,6 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
> >     return true;
> >   }
> >   
> > -SDValue AMDGPUDAGToDAGISel::SimplifyI24(SDValue &Op) {
> > -  APInt Demanded = APInt(32, 0x00FFFFFF);
> > -  APInt KnownZero, KnownOne;
> > -  TargetLowering::TargetLoweringOpt TLO(*CurDAG, true, true);
> > -  const TargetLowering *TLI = getTargetLowering();
> > -  if (TLI->SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) {
> > -    CurDAG->ReplaceAllUsesWith(Op, TLO.New);
> > -    CurDAG->RepositionNode(Op.getNode(), TLO.New.getNode());
> > -    return SimplifyI24(TLO.New);
> > -  } else {
> > -    return  Op;
> > -  }
> > -}
> > -
> > -bool AMDGPUDAGToDAGISel::SelectI24(SDValue Op, SDValue &I24) {
> > -
> > -  assert(Op.getValueType() == MVT::i32);
> > -
> > -  if (CurDAG->ComputeNumSignBits(Op) == 9) {
> > -    I24 = SimplifyI24(Op);
> > -    return true;
> > -  }
> > -  return false;
> > -}
> > -
> > -bool AMDGPUDAGToDAGISel::SelectU24(SDValue Op, SDValue &U24) {
> > -  APInt KnownZero;
> > -  APInt KnownOne;
> > -  CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne);
> > -
> > -  assert (Op.getValueType() == MVT::i32);
> > -
> > -  // ANY_EXTEND and EXTLOAD operations can only be done on types smaller than
> > -  // i32.  These smaller types are legal to use with the i24 instructions.
> > -  if ((KnownZero & APInt(KnownZero.getBitWidth(), 0xFF000000)) == 0xFF000000 ||
> > -       Op.getOpcode() == ISD::ANY_EXTEND ||
> > -       ISD::isEXTLoad(Op.getNode())) {
> > -    U24 = SimplifyI24(Op);
> > -    return true;
> > -  }
> > -  return false;
> > -}
> > -
> >   void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
> >     const AMDGPUTargetLowering& Lowering =
> >       (*(const AMDGPUTargetLowering*)getTargetLowering());
> > diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
> > index ddf251f..85f7ca2 100644
> > --- a/lib/Target/R600/AMDGPUISelLowering.cpp
> > +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
> > @@ -225,6 +225,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
> >     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
> >   
> >     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
> > +
> > +  setTargetDAGCombine(ISD::MUL);
> >   }
> >   
> >   //===----------------------------------------------------------------------===//
> > @@ -1037,6 +1039,85 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
> >   }
> >   
> >   //===----------------------------------------------------------------------===//
> > +// Custom DAG optimizations
> > +//===----------------------------------------------------------------------===//
> > +
> > +static bool isU24(SDValue Op, SelectionDAG &DAG) {
> > +  APInt KnownZero, KnownOne;
> > +  EVT VT = Op.getValueType();
> > +  DAG.ComputeMaskedBits(Op, KnownZero, KnownOne);
> > +
> > +  return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
> > +}
> > +
> > +static bool isI24(SDValue Op, SelectionDAG &DAG) {
> > +  EVT VT = Op.getValueType();
> > +
> > +  // In order for this to be a signed 24-bit value, bit 23 must
> > +  // be a sign bit.
> > +  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
> > +                                     // as unsigned 24-bit values.
> > +         (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
> > +}
> > +
> > +static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
> > +
> > +  SelectionDAG &DAG = DCI.DAG;
> > +  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
> > +  EVT VT = Op.getValueType();
> > +
> > +  APInt Demanded = APInt(VT.getSizeInBits(), 0x00FFFFFF);
> I think using APInt::getLowBitsSet is easier to read than counting Fs
> 
> > +  APInt KnownZero, KnownOne;
> > +  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
> > +  if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
> > +    DCI.CommitTargetLoweringOpt(TLO);
> > +}
> > +
> 
> It might help to implemented computeMaskedBitsForTargetNode and 
> ComputeNumSignBitsForTargetNode for these in case they end up being chained
>

The instruction's output is 32 bits, so I don't think I can implement
computeMaskedBitsForTargetNode, but I should be able to re-use the
ISD::MUL implementation of ComputeNumSignBits.

-Tom

> > +SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
> > +                                            DAGCombinerInfo &DCI) const {
> > +  SelectionDAG &DAG = DCI.DAG;
> > +  SDLoc DL(N);
> > +
> > +  switch(N->getOpcode()) {
> > +    default: break;
> > +    case ISD::MUL: {
> > +      EVT VT = N->getValueType(0);
> > +      SDValue N0 = N->getOperand(0);
> > +      SDValue N1 = N->getOperand(1);
> > +      SDValue Mul;
> > +
> > +      if (VT.isVector())
> > +        break;
> > +
> > +      if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
> > +        N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
> > +        N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
> > +        Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
> > +      } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
> > +        N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
> > +        N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
> > +        Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
> > +      } else {
> > +        break;
> > +      }
> > +
> > +      SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT);
> > +
> > +      return Reg;
> > +    }
> > +    case AMDGPUISD::MUL_I24:
> > +    case AMDGPUISD::MUL_U24: {
> > +      SDValue N0 = N->getOperand(0);
> > +      SDValue N1 = N->getOperand(1);
> > +      simplifyI24(N0, DCI);
> > +      simplifyI24(N1, DCI);
> > +      return SDValue();
> > +    }
> > +  }
> > +  return SDValue();
> > +}
> > +
> > +//===----------------------------------------------------------------------===//
> >   // Helper functions
> >   //===----------------------------------------------------------------------===//
> >   
> > @@ -1130,6 +1211,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
> >     NODE_NAME_CASE(UMIN)
> >     NODE_NAME_CASE(BFE_U32)
> >     NODE_NAME_CASE(BFE_I32)
> > +  NODE_NAME_CASE(MUL_U24)
> > +  NODE_NAME_CASE(MUL_I24)
> >     NODE_NAME_CASE(URECIP)
> >     NODE_NAME_CASE(DOT4)
> >     NODE_NAME_CASE(EXPORT)
> > diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
> > index 2595c51..5566451 100644
> > --- a/lib/Target/R600/AMDGPUISelLowering.h
> > +++ b/lib/Target/R600/AMDGPUISelLowering.h
> > @@ -131,6 +131,8 @@ public:
> >     /// We don't want to shrink f64/f32 constants.
> >     bool ShouldShrinkFPConstant(EVT VT) const;
> >   
> > +  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
> > +
> >   private:
> >     void InitAMDILLowering();
> >     SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
> > @@ -177,6 +179,8 @@ enum {
> >     DOT4,
> >     BFE_U32, // Extract range of bits with zero extension to 32-bits.
> >     BFE_I32, // Extract range of bits with sign extension to 32-bits.
> > +  MUL_U24,
> > +  MUL_I24,
> >     TEXTURE_FETCH,
> >     EXPORT,
> >     CONST_ADDRESS,
> > diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
> > index 2138bd2..26a0473 100644
> > --- a/lib/Target/R600/AMDGPUInstrInfo.td
> > +++ b/lib/Target/R600/AMDGPUInstrInfo.td
> > @@ -90,3 +90,11 @@ def AMDGPUround : SDNode<"ISD::FROUND",
> >   def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
> >   def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
> >   
> > +// Signed and unsigned 24-bit multiply.  The highest 8-bits are ignored when
> > +// performing the multiply.  The result is a 32-bit value.
> > +def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
> > +  [SDNPCommutative]
> > +>;
> > +def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
> > +  [SDNPCommutative]
> > +>;
> > diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
> > index 505fc81..cea7a90 100644
> > --- a/lib/Target/R600/AMDGPUInstructions.td
> > +++ b/lib/Target/R600/AMDGPUInstructions.td
> > @@ -253,9 +253,6 @@ def FP_ONE : PatLeaf <
> >     [{return N->isExactlyValue(1.0);}]
> >   >;
> >   
> > -def U24 : ComplexPattern<i32, 1, "SelectU24", [], []>;
> > -def I24 : ComplexPattern<i32, 1, "SelectI24", [], []>;
> > -
> >   let isCodeGenOnly = 1, isPseudo = 1 in {
> >   
> >   let usesCustomInserter = 1  in {
> > diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
> > index 8874d14..7cf102c 100644
> > --- a/lib/Target/R600/AMDGPUSubtarget.h
> > +++ b/lib/Target/R600/AMDGPUSubtarget.h
> > @@ -77,6 +77,15 @@ public:
> >       return hasBFE();
> >     }
> >   
> > +  bool hasMulU24() const {
> > +    return (getGeneration() >= EVERGREEN);
> > +  }
> > +
> > +  bool hasMulI24() const {
> > +    return (getGeneration() >= SOUTHERN_ISLANDS ||
> > +            hasCaymanISA());
> > +  }
> > +
> >     bool IsIRStructurizerEnabled() const;
> >     bool isIfCvtEnabled() const;
> >     unsigned getWavefrontSize() const;
> > diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td
> > index acd7bde..837d602 100644
> > --- a/lib/Target/R600/CaymanInstructions.td
> > +++ b/lib/Target/R600/CaymanInstructions.td
> > @@ -21,10 +21,10 @@ def isCayman : Predicate<"Subtarget.hasCaymanISA()">;
> >   let Predicates = [isCayman] in {
> >   
> >   def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24",
> > -  [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))], VecALU
> > +  [(set i32:$dst, (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2))], VecALU
> >   >;
> >   def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24",
> > -  [(set i32:$dst, (mul I24:$src0, I24:$src1))], VecALU
> > +  [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU
> >   >;
> >   
> >   let isVector = 1 in {
> > diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td
> > index 384b98e..184b2aa 100644
> > --- a/lib/Target/R600/EvergreenInstructions.td
> > +++ b/lib/Target/R600/EvergreenInstructions.td
> > @@ -286,7 +286,7 @@ def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>;
> >   defm : BFIPatterns <BFI_INT_eg>;
> >   
> >   def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
> > -  [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))], VecALU
> > +  [(set i32:$dst, (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2))], VecALU
> >   >;
> >   def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
> >   def : ROTRPattern <BIT_ALIGN_INT_eg>;
> > @@ -301,7 +301,7 @@ def CNDGE_eg : CNDGE_Common<0x1B>;
> >   def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
> >   def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
> >   def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
> > -  [(set i32:$dst, (mul U24:$src0, U24:$src1))], VecALU
> > +  [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU
> >   >;
> >   def DOT4_eg : DOT4_Common<0xBE>;
> >   defm CUBE_eg : CUBE_Common<0xC0>;
> > diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
> > index 4d15321..e286fb2 100644
> > --- a/lib/Target/R600/R600ISelLowering.cpp
> > +++ b/lib/Target/R600/R600ISelLowering.cpp
> > @@ -1522,6 +1522,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
> >     SelectionDAG &DAG = DCI.DAG;
> >   
> >     switch (N->getOpcode()) {
> > +  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
> >     // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
> >     case ISD::FP_ROUND: {
> >         SDValue Arg = N->getOperand(0);
> > diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> > index 0e9de5d..2e9bab3 100644
> > --- a/lib/Target/R600/SIISelLowering.cpp
> > +++ b/lib/Target/R600/SIISelLowering.cpp
> > @@ -954,7 +954,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
> >     EVT VT = N->getValueType(0);
> >   
> >     switch (N->getOpcode()) {
> > -    default: break;
> > +    default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
> >       case ISD::SELECT_CC: {
> >         ConstantSDNode *True, *False;
> >         // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
> > diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> > index 8ec2971..b51cd7f 100644
> > --- a/lib/Target/R600/SIInstructions.td
> > +++ b/lib/Target/R600/SIInstructions.td
> > @@ -946,11 +946,11 @@ defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
> >   
> >   
> >   defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24",
> > -  [(set i32:$dst, (mul I24:$src0, I24:$src1))]
> > +  [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))]
> >   >;
> >   //defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
> >   defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24",
> > -  [(set i32:$dst, (mul U24:$src0, U24:$src1))]
> > +  [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))]
> >   >;
> >   //defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
> >   
> > @@ -1045,10 +1045,10 @@ let neverHasSideEffects = 1 in {
> >   def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
> >   def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>;
> >   def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24",
> > -  [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))]
> > +  [(set i32:$dst, (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2))]
> >   >;
> >   def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24",
> > -  [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))]
> > +  [(set i32:$dst, (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2))]
> >   >;
> >   
> >   } // End neverHasSideEffects
> > diff --git a/test/CodeGen/R600/mad_int24.ll b/test/CodeGen/R600/mad_int24.ll
> > index df063ec..abb5290 100644
> > --- a/test/CodeGen/R600/mad_int24.ll
> > +++ b/test/CodeGen/R600/mad_int24.ll
> > @@ -1,12 +1,15 @@
> > -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
> > -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
> > -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
> > +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
> > +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
> > +; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
> >   
> > -; EG-CHECK: @i32_mad24
> > +; FUNC-LABEL: @i32_mad24
> >   ; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
> > -; EG-CHECK: MULLO_INT
> > -; CM-CHECK: MULADD_INT24 {{[ *]*}}T{{[0-9].[XYZW]}}, KC0[2].Z, KC0[2].W, KC0[3].X
> > -; SI-CHECK: V_MAD_I32_I24
> > +; EG: MULLO_INT
> > +; Make sure we aren't masking the inputs.
> > +; CM-NOT: AND
> > +; CM: MULADD_INT24
> > +; SI-NOT: AND
> > +; SI: V_MAD_I32_I24
> >   define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
> >   entry:
> >     %0 = shl i32 %a, 8
> > diff --git a/test/CodeGen/R600/mad_uint24.ll b/test/CodeGen/R600/mad_uint24.ll
> > index 3dcadc9..0f0893b 100644
> > --- a/test/CodeGen/R600/mad_uint24.ll
> > +++ b/test/CodeGen/R600/mad_uint24.ll
> > @@ -1,11 +1,10 @@
> > -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
> > -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
> > -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
> > +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
> > +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
> > +; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
> >   
> > -; EG-CHECK-LABEL: @u32_mad24
> > -; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W, KC0[3].X
> > -; SI-CHECK-LABEL: @u32_mad24
> > -; SI-CHECK: V_MAD_U32_U24
> > +; FUNC-LABEL: @u32_mad24
> > +; EG: MULADD_UINT24
> > +; SI: V_MAD_U32_U24
> >   
> >   define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
> >   entry:
> > @@ -19,18 +18,14 @@ entry:
> >     ret void
> >   }
> >   
> > -; EG-CHECK-LABEL: @i16_mad24
> > -; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
> > -; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
> > -; EG-CHECK-DAG: VTX_READ_16 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48
> > +; FUNC-LABEL: @i16_mad24
> >   ; The order of A and B does not matter.
> > -; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]]
> > +; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
> >   ; The result must be sign-extended
> > -; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
> > -; EG-CHECK: 16
> > -; SI-CHECK-LABEL: @i16_mad24
> > -; SI-CHECK: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
> > -; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MAD]], 0, 16
> > +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
> > +; EG: 16
> > +; SI: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
> > +; SI: V_BFE_I32 v{{[0-9]}}, [[MAD]], 0, 16
> >   
> >   define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
> >   entry:
> > @@ -41,18 +36,13 @@ entry:
> >     ret void
> >   }
> >   
> > -; EG-CHECK-LABEL: @i8_mad24
> > -; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
> > -; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
> > -; EG-CHECK-DAG: VTX_READ_8 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48
> > -; The order of A and B does not matter.
> > -; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]]
> > +; FUNC-LABEL: @i8_mad24
> > +; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
> >   ; The result must be sign-extended
> > -; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
> > -; EG-CHECK: 8
> > -; SI-CHECK-LABEL: @i8_mad24
> > -; SI-CHECK: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
> > -; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8
> > +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
> > +; EG: 8
> > +; SI: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
> > +; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8
> >   
> >   define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
> >   entry:
> > @@ -62,3 +52,24 @@ entry:
> >     store i32 %2, i32 addrspace(1)* %out
> >     ret void
> >   }
> > +
> > +; This tests for a bug where the mad_u24 pattern matcher would call
> > +; SimplifyDemandedBits on the first operand of the mul instruction
> > +; assuming that the pattern would be matched to a 24-bit mad.  This
> > +; led to some instructions being incorrectly erased when the entire
> > +; 24-bit mad pattern wasn't being matched.
> > +
> > +; Check that the select instruction is not deleted.
> > +; FUNC-LABEL: @i24_i32_i32_mad
> > +; EG: CNDE_INT
> > +; SI: V_CNDMASK
> > +define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
> > +entry:
> > +  %0 = ashr i32 %a, 8
> > +  %1 = icmp ne i32 %c, 0
> > +  %2 = select i1 %1, i32 %0, i32 34
> > +  %3 = mul i32 %2, %c
> > +  %4 = add i32 %3, %d
> > +  store i32 %4, i32 addrspace(1)* %out
> > +  ret void
> > +}
> > diff --git a/test/CodeGen/R600/mul_int24.ll b/test/CodeGen/R600/mul_int24.ll
> > index 66a1a9e..046911b 100644
> > --- a/test/CodeGen/R600/mul_int24.ll
> > +++ b/test/CodeGen/R600/mul_int24.ll
> > @@ -1,12 +1,15 @@
> > -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
> > -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
> > -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
> > +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
> > +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
> > +; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
> >   
> > -; EG-CHECK: @i32_mul24
> > +; FUNC-LABEL: @i32_mul24
> >   ; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
> > -; EG-CHECK: MULLO_INT
> > -; CM-CHECK: MUL_INT24 {{[ *]*}}T{{[0-9].[XYZW]}}, KC0[2].Z, KC0[2].W
> > -; SI-CHECK: V_MUL_I32_I24
> > +; EG: MULLO_INT
> > +; Make sure we are not masking the inputs
> > +; CM-NOT: AND
> > +; CM: MUL_INT24
> > +; SI-NOT: AND
> > +; SI: V_MUL_I32_I24
> >   define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
> >   entry:
> >     %0 = shl i32 %a, 8
> > diff --git a/test/CodeGen/R600/mul_uint24.ll b/test/CodeGen/R600/mul_uint24.ll
> > index a413961..27b3717 100644
> > --- a/test/CodeGen/R600/mul_uint24.ll
> > +++ b/test/CodeGen/R600/mul_uint24.ll
> > @@ -1,11 +1,10 @@
> > -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
> > -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
> > -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
> > +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
> > +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
> > +; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
> >   
> > -; EG-CHECK-LABEL: @u32_mul24
> > -; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
> > -; SI-CHECK-LABEL: @u32_mul24
> > -; SI-CHECK: V_MUL_U32_U24
> > +; FUNC-LABEL: @u32_mul24
> > +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
> > +; SI: V_MUL_U32_U24
> >   
> >   define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
> >   entry:
> > @@ -18,17 +17,13 @@ entry:
> >     ret void
> >   }
> >   
> > -; EG-CHECK-LABEL: @i16_mul24
> > -; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
> > -; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
> > -; The order of A and B does not matter.
> > -; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]]
> > +; FUNC-LABEL: @i16_mul24
> > +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
> >   ; The result must be sign-extended
> > -; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
> > -; EG-CHECK: 16
> > -; SI-CHECK-LABEL: @i16_mul24
> > -; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
> > -; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 16,
> > +; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
> > +; EG: 16
> > +; SI: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
> > +; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 16,
> >   define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) {
> >   entry:
> >     %0 = mul i16 %a, %b
> > @@ -37,16 +32,12 @@ entry:
> >     ret void
> >   }
> >   
> > -; EG-CHECK-LABEL: @i8_mul24
> > -; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
> > -; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
> > -; The order of A and B does not matter.
> > -; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]]
> > +; FUNC-LABEL: @i8_mul24
> > +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
> >   ; The result must be sign-extended
> > -; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
> > -; SI-CHECK-LABEL: @i8_mul24
> > -; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
> > -; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8,
> > +; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
> > +; SI: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
> > +; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8,
> >   
> >   define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) {
> >   entry:
> 
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits



More information about the llvm-commits mailing list