getting assert [Re: [llvm] r370327 - [X86][CodeGen][NFC] Delay `combineIncDecVector()` from DAGCombine to X86DAGToDAGISel]

Sun Sep 8 12:04:15 PDT 2019

Roman,

Despite this being kinda NFC we see it causing regressions.
Here is a small reproducer that asserts  while running through llc 
(testcase shown below and also attached):

] cat reduced.ll
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
target triple = "x86_64-unknown-linux-gnu"

define void @TestvMeth(i32 %0, i64 %1) gc "statepoint-example" !prof !1 {
bci_0:
   %token418 = call token (i64, i32, i8 * (i64, i32, i32, i32)*, i32, 
i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i8i64i32i32i32f(i64 
2882400000, i32 0, i8 * (i64, i32, i32, i32)* nonnull @newarray, i32 4, 
i32 0, i64 undef, i32 10, i32 10, i32 400, i32 0, i32 35, i32 0, i32 1, 
i32 0, i32 43, i32 1, i32 13, i32 0, i32 3, i32 400, i32 3, i32 %0, i32 
4, i64 %1, i32 7, i8* null, i32 3, i32 -11464, i32 7, i8* null, i32 3, 
i32 -243, i32 3, i32 14, i32 3, i32 117, i32 3, i32 -13, i32 3, i32 -15, 
i32 3, i32 -210, i32 3, i32 541, i32 7, i8* null)
   %v2 = load atomic float, float * undef unordered, align 4
   %v3 = load <4 x i32>, <4 x i32> * undef, align 4
   %v4 = add <4 x i32> %v3, <i32 1, i32 1, i32 1, i32 1>
   store <4 x i32> %v4, <4 x i32> * undef, align 4
   %v5 = fadd float %v2, 1.500000e+01
   store atomic float %v5, float * undef unordered, align 4
   unreachable
}

declare i32* @personality_function()
declare i8 * @newarray(i64, i32, i32, i32)
declare token @llvm.experimental.gc.statepoint.p0f_p1i8i64i32i32i32f(i64 
immarg, i32 immarg, i8 * (i64, i32, i32, i32)*, i32 immarg, i32 immarg, ...)

!1 = !{!"function_entry_count", i64 32768}
] bin/llc reduced.ll
llc: 
/home/fsergeev/ws/llvm-upstream/llvm-mono/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp:1149: 
void llvm::SelectionDAGISel::DoInstructionSelection(): Assertion 
`Op->getNodeId() != -1 && "Node has already selected predecessor node
"' failed.
...

It stops asserting when I revert this commit locally.

regards,
   Fedor.

On 8/29/19 1:50 PM, Roman Lebedev via llvm-commits wrote:
> Author: lebedevri
> Date: Thu Aug 29 03:50:09 2019
> New Revision: 370327
>
> URL: http://llvm.org/viewvc/llvm-project?rev=370327&view=rev
> Log:
> [X86][CodeGen][NFC] Delay `combineIncDecVector()` from DAGCombine to X86DAGToDAGISel
>
> Summary:
> We were previously doing it in DAGCombine.
> But we also want to do `sub %x, C` -> `add %x, (sub 0, C)` for vectors in DAGCombine.
> So if we had `sub %x, -1`, we'll transform it to `add %x, 1`,
> which `combineIncDecVector()` will immediately transform back into `sub %x, -1`,
> and here we go again...
>
> I've marked this as NFC since not a single test changes,
> but since that 'changes' DAGCombine, probably this isn't fully NFC.
>
> Reviewers: RKSimon, craig.topper, spatel
>
> Reviewed By: craig.topper
>
> Subscribers: hiraditya, llvm-commits
>
> Tags: #llvm
>
> Differential Revision: https://reviews.llvm.org/D62327
>
> Modified:
>      llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
>      llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>      llvm/trunk/lib/Target/X86/X86ISelLowering.h
>      llvm/trunk/test/CodeGen/X86/stack-folding-int-avx1.ll
>      llvm/trunk/test/CodeGen/X86/stack-folding-int-sse42.ll
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp?rev=370327&r1=370326&r2=370327&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp Thu Aug 29 03:50:09 2019
> @@ -507,6 +507,7 @@ namespace {
>       bool shrinkAndImmediate(SDNode *N);
>       bool isMaskZeroExtended(SDNode *N) const;
>       bool tryShiftAmountMod(SDNode *N);
> +    bool combineIncDecVector(SDNode *Node);
>       bool tryShrinkShlLogicImm(SDNode *N);
>       bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
>   
> @@ -3775,6 +3776,49 @@ bool X86DAGToDAGISel::tryShrinkShlLogicI
>     return true;
>   }
>   
> +/// Convert vector increment or decrement to sub/add with an all-ones constant:
> +/// add X, <1, 1...> --> sub X, <-1, -1...>
> +/// sub X, <1, 1...> --> add X, <-1, -1...>
> +/// The all-ones vector constant can be materialized using a pcmpeq instruction
> +/// that is commonly recognized as an idiom (has no register dependency), so
> +/// that's better/smaller than loading a splat 1 constant.
> +bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) {
> +  assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) &&
> +         "Unexpected opcode for increment/decrement transform");
> +
> +  EVT VT = Node->getValueType(0);
> +  assert(VT.isVector() && "Should only be called for vectors.");
> +
> +  SDValue X = Node->getOperand(0);
> +  SDValue OneVec = Node->getOperand(1);
> +
> +  APInt SplatVal;
> +  if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue())
> +    return false;
> +
> +  SDLoc DL(Node);
> +  SDValue AllOnesVec;
> +
> +  APInt Ones = APInt::getAllOnesValue(32);
> +  assert(VT.getSizeInBits() % 32 == 0 &&
> +         "Expected bit count to be a multiple of 32");
> +  unsigned NumElts = VT.getSizeInBits() / 32;
> +  assert(NumElts > 0 && "Expected to get non-empty vector.");
> +  AllOnesVec =
> +      CurDAG->getConstant(Ones, DL, MVT::getVectorVT(MVT::i32, NumElts));
> +  insertDAGNode(*CurDAG, X, AllOnesVec);
> +
> +  AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec);
> +  insertDAGNode(*CurDAG, X, AllOnesVec);
> +
> +  unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
> +  SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec);
> +
> +  ReplaceNode(Node, NewNode.getNode());
> +  SelectCode(NewNode.getNode());
> +  return true;
> +}
> +
>   /// If the high bits of an 'and' operand are known zero, try setting the
>   /// high bits of an 'and' constant operand to produce a smaller encoding by
>   /// creating a small, sign-extended negative immediate rather than a large
> @@ -4347,6 +4391,10 @@ void X86DAGToDAGISel::Select(SDNode *Nod
>       LLVM_FALLTHROUGH;
>     case ISD::ADD:
>     case ISD::SUB: {
> +    if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
> +        combineIncDecVector(Node))
> +      return;
> +
>       // Try to avoid folding immediates with multiple uses for optsize.
>       // This code tries to select to register form directly to avoid going
>       // through the isel table which might fold the immediate. We can't change
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=370327&r1=370326&r2=370327&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Aug 29 03:50:09 2019
> @@ -6215,7 +6215,9 @@ static bool getTargetConstantBitsFromNod
>     return false;
>   }
>   
> -static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
> +namespace llvm {
> +namespace X86 {
> +bool isConstantSplat(SDValue Op, APInt &SplatVal) {
>     APInt UndefElts;
>     SmallVector<APInt, 16> EltBits;
>     if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
> @@ -6238,6 +6240,8 @@ static bool isConstantSplat(SDValue Op,
>   
>     return false;
>   }
> +} // namespace X86
> +} // namespace llvm
>   
>   static bool getTargetShuffleMaskIndices(SDValue MaskNode,
>                                           unsigned MaskEltSizeInBits,
> @@ -18115,7 +18119,7 @@ static SDValue LowerFunnelShift(SDValue
>         std::swap(Op0, Op1);
>   
>       APInt APIntShiftAmt;
> -    if (isConstantSplat(Amt, APIntShiftAmt)) {
> +    if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
>         uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
>         return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
>                            Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8));
> @@ -25337,7 +25341,7 @@ static SDValue LowerScalarImmediateShift
>   
>     // Optimize shl/srl/sra with constant shift amount.
>     APInt APIntShiftAmt;
> -  if (!isConstantSplat(Amt, APIntShiftAmt))
> +  if (!X86::isConstantSplat(Amt, APIntShiftAmt))
>       return SDValue();
>   
>     // If the shift amount is out of range, return undef.
> @@ -43750,39 +43754,6 @@ static SDValue combineLoopSADPattern(SDN
>     return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
>   }
>   
> -/// Convert vector increment or decrement to sub/add with an all-ones constant:
> -/// add X, <1, 1...> --> sub X, <-1, -1...>
> -/// sub X, <1, 1...> --> add X, <-1, -1...>
> -/// The all-ones vector constant can be materialized using a pcmpeq instruction
> -/// that is commonly recognized as an idiom (has no register dependency), so
> -/// that's better/smaller than loading a splat 1 constant.
> -static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG,
> -                                   TargetLowering::DAGCombinerInfo &DCI) {
> -  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
> -         "Unexpected opcode for increment/decrement transform");
> -
> -  // Delay this until legalize ops to avoid interfering with early DAG combines
> -  // that may expect canonical adds.
> -  // FIXME: We may want to consider moving this to custom lowering or all the
> -  // way to isel, but lets start here.
> -  if (DCI.isBeforeLegalizeOps())
> -    return SDValue();
> -
> -  // Pseudo-legality check: getOnesVector() expects one of these types, so bail
> -  // out and wait for legalization if we have an unsupported vector length.
> -  EVT VT = N->getValueType(0);
> -  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
> -    return SDValue();
> -
> -  APInt SplatVal;
> -  if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue())
> -    return SDValue();
> -
> -  SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
> -  unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
> -  return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
> -}
> -
>   static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
>                               const SDLoc &DL, EVT VT,
>                               const X86Subtarget &Subtarget) {
> @@ -44045,9 +44016,6 @@ static SDValue combineAdd(SDNode *N, Sel
>                               HADDBuilder);
>     }
>   
> -  if (SDValue V = combineIncDecVector(N, DAG, DCI))
> -    return V;
> -
>     return combineAddOrSubToADCOrSBB(N, DAG);
>   }
>   
> @@ -44176,9 +44144,6 @@ static SDValue combineSub(SDNode *N, Sel
>                               HSUBBuilder);
>     }
>   
> -  if (SDValue V = combineIncDecVector(N, DAG, DCI))
> -    return V;
> -
>     // Try to create PSUBUS if SUB's argument is max/min
>     if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
>       return V;
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=370327&r1=370326&r2=370327&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Thu Aug 29 03:50:09 2019
> @@ -683,6 +683,9 @@ namespace llvm {
>       bool isCalleePop(CallingConv::ID CallingConv,
>                        bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
>   
> +    /// If Op is a constant whose elements are all the same constant or
> +    /// undefined, return true and return the constant value in \p SplatVal.
> +    bool isConstantSplat(SDValue Op, APInt &SplatVal);
>     } // end namespace X86
>   
>     //===--------------------------------------------------------------------===//
>
> Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx1.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-avx1.ll?rev=370327&r1=370326&r2=370327&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx1.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx1.ll Thu Aug 29 03:50:09 2019
> @@ -540,8 +540,8 @@ define <16 x i8> @stack_fold_pandn(<16 x
>   ; CHECK-NEXT:    #APP
>   ; CHECK-NEXT:    nop
>   ; CHECK-NEXT:    #NO_APP
> -; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
>   ; CHECK-NEXT:    vpandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
> +; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
>   ; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
>   ; CHECK-NEXT:    retq
>     %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
>
> Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-sse42.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-sse42.ll?rev=370327&r1=370326&r2=370327&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/stack-folding-int-sse42.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/stack-folding-int-sse42.ll Thu Aug 29 03:50:09 2019
> @@ -724,8 +724,8 @@ define <16 x i8> @stack_fold_pandn(<16 x
>   ; CHECK-NEXT:    #APP
>   ; CHECK-NEXT:    nop
>   ; CHECK-NEXT:    #NO_APP
> -; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
>   ; CHECK-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
> +; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
>   ; CHECK-NEXT:    psubb %xmm1, %xmm0
>   ; CHECK-NEXT:    retq
>     %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits

-------------- next part --------------
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
target triple = "x86_64-unknown-linux-gnu"

define void @TestvMeth(i32 %0, i64 %1) gc "statepoint-example" !prof !1 {
bci_0:
  %token418 = call token (i64, i32, i8 * (i64, i32, i32, i32)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i8i64i32i32i32f(i64 2882400000, i32 0, i8 * (i64, i32, i32, i32)* nonnull @newarray, i32 4, i32 0, i64 undef, i32 10, i32 10, i32 400, i32 0, i32 35, i32 0, i32 1, i32 0, i32 43, i32 1, i32 13, i32 0, i32 3, i32 400, i32 3, i32 %0, i32 4, i64 %1, i32 7, i8* null, i32 3, i32 -11464, i32 7, i8* null, i32 3, i32 -243, i32 3, i32 14, i32 3, i32 117, i32 3, i32 -13, i32 3, i32 -15, i32 3, i32 -210, i32 3, i32 541, i32 7, i8* null) 
  %v2 = load atomic float, float * undef unordered, align 4
  %v3 = load <4 x i32>, <4 x i32> * undef, align 4
  %v4 = add <4 x i32> %v3, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %v4, <4 x i32> * undef, align 4
  %v5 = fadd float %v2, 1.500000e+01
  store atomic float %v5, float * undef unordered, align 4
  unreachable
}

declare i32* @personality_function() 
declare i8 * @newarray(i64, i32, i32, i32) 
declare token @llvm.experimental.gc.statepoint.p0f_p1i8i64i32i32i32f(i64 immarg, i32 immarg, i8 * (i64, i32, i32, i32)*, i32 immarg, i32 immarg, ...)

!1 = !{!"function_entry_count", i64 32768}