[llvm] 257acbf - [SelectionDAG] Combine U{ADD,SUB}O diamonds into {ADD,SUB}CARRY
David Zarzycki via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 20 06:28:17 PST 2019
Author: David Zarzycki
Date: 2019-11-20T16:25:42+02:00
New Revision: 257acbf6aee983227a3976d10d0086f3600f2bee
URL: https://github.com/llvm/llvm-project/commit/257acbf6aee983227a3976d10d0086f3600f2bee
DIFF: https://github.com/llvm/llvm-project/commit/257acbf6aee983227a3976d10d0086f3600f2bee.diff
LOG: [SelectionDAG] Combine U{ADD,SUB}O diamonds into {ADD,SUB}CARRY
Summary:
Convert (uaddo (uaddo x, y), carryIn) into addcarry x, y, carryIn if and only if the carry flags of the two uaddo nodes are merged via OR or XOR.
Work remaining: match ADD, etc.
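
For illustration, a hand-written IR sketch of the pattern (invented here,
not taken from this patch; the names are illustrative). The two
uadd.with.overflow calls lower to the UADDO diamond, the zext feeds the
carry in, and the final or merges the two partial carry flags:

  declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

  define { i64, i1 } @diamond(i64 %a, i64 %b, i1 %carryin) {
    %pair0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
    %partial = extractvalue { i64, i1 } %pair0, 0
    %carryx = extractvalue { i64, i1 } %pair0, 1
    %cin = zext i1 %carryin to i64
    %pair1 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %partial, i64 %cin)
    %sum = extractvalue { i64, i1 } %pair1, 0
    %carryy = extractvalue { i64, i1 } %pair1, 1
    ; With this patch, the whole diamond selects to a single ADDCARRY node.
    %carryout = or i1 %carryx, %carryy
    %r0 = insertvalue { i64, i1 } undef, i64 %sum, 0
    %r1 = insertvalue { i64, i1 } %r0, i1 %carryout, 1
    ret { i64, i1 } %r1
  }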
Reviewers: craig.topper, RKSimon, spatel, niravd, jonpa, uweigand, deadalnix, nikic, lebedev.ri, dmgreen, chfast
Reviewed By: lebedev.ri
Subscribers: chfast, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70079
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/X86/addcarry.ll
llvm/test/CodeGen/X86/subcarry.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9a9229e02fe6..8d691b0cc7c0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2802,6 +2802,96 @@ static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
return SDValue();
}
+// If we are facing some sort of diamond carry/borrow in/out pattern try to
+// match patterns like:
+//
+// (uaddo A, B) CarryIn
+// | \ |
+// | \ |
+// PartialSum PartialCarryOutX /
+// | | /
+// | ____|____________/
+// | / |
+// (uaddo *, *) \________
+// | \ \
+// | \ |
+// | PartialCarryOutY |
+// | \ |
+// | \ /
+// AddCarrySum | ______/
+// | /
+// CarryOut = (or *, *)
+//
+// And generate ADDCARRY (or SUBCARRY) with two result values:
+//
+// {AddCarrySum, CarryOut} = (addcarry A, B, CarryIn)
+//
+// Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with
+// a single path for carry/borrow out propagation:
+static SDValue combineCarryDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
+ const TargetLowering &TLI, SDValue Carry0,
+ SDValue Carry1, SDNode *N) {
+ if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1)
+ return SDValue();
+ unsigned Opcode = Carry0.getOpcode();
+ if (Opcode != Carry1.getOpcode())
+ return SDValue();
+ if (Opcode != ISD::UADDO && Opcode != ISD::USUBO)
+ return SDValue();
+
+ // Canonicalize the add/sub of A and B as Carry0 and the add/sub of the
+ // carry/borrow in as Carry1. (The top and middle uaddo nodes respectively in
+ // the above ASCII art.)
+ if (Carry1.getOperand(0) != Carry0.getValue(0) &&
+ Carry1.getOperand(1) != Carry0.getValue(0))
+ std::swap(Carry0, Carry1);
+ if (Carry1.getOperand(0) != Carry0.getValue(0) &&
+ Carry1.getOperand(1) != Carry0.getValue(0))
+ return SDValue();
+
+ // The carry in value must be on the righthand side for subtraction.
+ unsigned CarryInOperandNum =
+ Carry1.getOperand(0) == Carry0.getValue(0) ? 1 : 0;
+ if (Opcode == ISD::USUBO && CarryInOperandNum != 1)
+ return SDValue();
+ SDValue CarryIn = Carry1.getOperand(CarryInOperandNum);
+
+ unsigned NewOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY;
+ if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType()))
+ return SDValue();
+
+ // Verify that the carry/borrow in is plausibly a carry/borrow bit.
+ // TODO: make getAsCarry() aware of how partial carries are merged.
+ if (CarryIn.getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+ CarryIn = CarryIn.getOperand(0);
+ if (CarryIn.getValueType() != MVT::i1)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue Merged =
+ DAG.getNode(NewOp, DL, Carry1->getVTList(), Carry0.getOperand(0),
+ Carry0.getOperand(1), CarryIn);
+
+ // Please note that because we have proven that the result of the UADDO/USUBO
+ // of A and B feeds into the UADDO/USUBO that does the carry/borrow in, we can
+ // therefore prove that if the first UADDO/USUBO overflows, the second
+ // UADDO/USUBO cannot. For example consider 8-bit numbers where 0xFF is the
+ // maximum value.
+ //
+ // 0xFF + 0xFF == 0xFE with carry but 0xFE + 1 does not carry
+ // 0x00 - 0xFF == 1 with a carry/borrow but 1 - 1 == 0 (no carry/borrow)
+ //
+ // This is important because it means that OR and XOR can be used to merge
+ // carry flags; and that AND can return a constant zero.
+ //
+ // TODO: match other operations that can merge flags (ADD, etc)
+ DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0));
+ if (N->getOpcode() == ISD::AND)
+ return DAG.getConstant(0, DL, MVT::i1);
+ return Merged.getValue(1);
+}
+
SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
SDNode *N) {
// fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry.
@@ -5093,6 +5183,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (SDValue Shuffle = XformToShuffleWithZero(N))
return Shuffle;
+ if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
+ return Combined;
+
// fold (and (or x, C), D) -> D if (C & D) == D
auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
@@ -5787,6 +5880,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
if (SDValue Combined = visitORLike(N0, N1, N))
return Combined;
+ if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
+ return Combined;
+
// Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
return BSwap;
@@ -7049,6 +7145,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
+ if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
+ return Combined;
+
return SDValue();
}
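
As the comment in combineCarryDiamond above notes, the two partial carries
are mutually exclusive, so merging them with AND is provably zero. A
minimal hand-written IR sketch of that case (illustrative only; it relies
on the add + icmp-ult shape being recognized as UADDO, as exercised by the
tests below):

  define i1 @and_of_partial_carries(i64 %a, i64 %b, i1 %carryin) {
    %s = add i64 %a, %b
    %carryx = icmp ult i64 %s, %a      ; carry out of a + b
    %cin = zext i1 %carryin to i64
    %t = add i64 %s, %cin
    %carryy = icmp ult i64 %t, %s      ; carry out of adding the carry in
    %both = and i1 %carryx, %carryy    ; at most one flag can be set: always 0
    ret i1 %both
  }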
diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll
index a4ea7d005565..9edcb9492a3d 100644
--- a/llvm/test/CodeGen/X86/addcarry.ll
+++ b/llvm/test/CodeGen/X86/addcarry.ll
@@ -511,40 +511,13 @@ define i32 @add_U320_without_i128_add(%struct.U320* nocapture dereferenceable(40
define i32 @add_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: add_U320_without_i128_or:
; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: addq 8(%rdi), %rdx
-; CHECK-NEXT: setb %r10b
; CHECK-NEXT: addq %rsi, (%rdi)
-; CHECK-NEXT: adcq $0, %rdx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 16(%rdi), %rcx
-; CHECK-NEXT: setb %r11b
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movzbl %al, %ebx
-; CHECK-NEXT: addq %rcx, %rbx
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: addq 24(%rdi), %r8
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: orb %r11b, %cl
-; CHECK-NEXT: movzbl %cl, %esi
-; CHECK-NEXT: addq %r8, %rsi
+; CHECK-NEXT: adcq %rdx, 8(%rdi)
+; CHECK-NEXT: adcq %rcx, 16(%rdi)
+; CHECK-NEXT: adcq %r8, 24(%rdi)
+; CHECK-NEXT: adcq %r9, 32(%rdi)
; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 32(%rdi), %r9
-; CHECK-NEXT: setb %r8b
-; CHECK-NEXT: orb %r10b, %al
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: addq %r9, %rax
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: movq %rdx, 8(%rdi)
-; CHECK-NEXT: movq %rbx, 16(%rdi)
-; CHECK-NEXT: movq %rsi, 24(%rdi)
-; CHECK-NEXT: movq %rax, 32(%rdi)
-; CHECK-NEXT: orb %r8b, %cl
-; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
%8 = load i64, i64* %7, align 8
@@ -594,40 +567,13 @@ define i32 @add_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40)
define i32 @add_U320_without_i128_xor(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: add_U320_without_i128_xor:
; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbx, -16
-; CHECK-NEXT: addq 8(%rdi), %rdx
-; CHECK-NEXT: setb %r10b
; CHECK-NEXT: addq %rsi, (%rdi)
-; CHECK-NEXT: adcq $0, %rdx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 16(%rdi), %rcx
-; CHECK-NEXT: setb %r11b
-; CHECK-NEXT: xorb %r10b, %al
-; CHECK-NEXT: movzbl %al, %ebx
-; CHECK-NEXT: addq %rcx, %rbx
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: addq 24(%rdi), %r8
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: xorb %r11b, %cl
-; CHECK-NEXT: movzbl %cl, %esi
-; CHECK-NEXT: addq %r8, %rsi
+; CHECK-NEXT: adcq %rdx, 8(%rdi)
+; CHECK-NEXT: adcq %rcx, 16(%rdi)
+; CHECK-NEXT: adcq %r8, 24(%rdi)
+; CHECK-NEXT: adcq %r9, 32(%rdi)
; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 32(%rdi), %r9
-; CHECK-NEXT: setb %r8b
-; CHECK-NEXT: xorb %r10b, %al
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: addq %r9, %rax
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: movq %rdx, 8(%rdi)
-; CHECK-NEXT: movq %rbx, 16(%rdi)
-; CHECK-NEXT: movq %rsi, 24(%rdi)
-; CHECK-NEXT: movq %rax, 32(%rdi)
-; CHECK-NEXT: xorb %r8b, %cl
-; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
%8 = load i64, i64* %7, align 8
@@ -674,34 +620,71 @@ define i32 @add_U320_without_i128_xor(%struct.U320* nocapture dereferenceable(40
ret i32 %43
}
+; Either the primary addition can overflow or the addition of the carry, but
+; they cannot both overflow.
+define i32 @bogus_add_U320_without_i128_and(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
+; CHECK-LABEL: bogus_add_U320_without_i128_and:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addq %rsi, (%rdi)
+; CHECK-NEXT: adcq %rdx, 8(%rdi)
+; CHECK-NEXT: addq %rcx, 16(%rdi)
+; CHECK-NEXT: addq %r8, 24(%rdi)
+; CHECK-NEXT: addq %r9, 32(%rdi)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+ %7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
+ %8 = load i64, i64* %7, align 8
+ %9 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 1
+ %10 = load i64, i64* %9, align 8
+ %11 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 2
+ %12 = load i64, i64* %11, align 8
+ %13 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 3
+ %14 = load i64, i64* %13, align 8
+ %15 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 4
+ %16 = load i64, i64* %15, align 8
+ %17 = add i64 %8, %1
+ %18 = add i64 %10, %2
+ %19 = icmp ult i64 %17, %1
+ %20 = zext i1 %19 to i64
+ %21 = add i64 %18, %20
+ %22 = add i64 %12, %3
+ %23 = icmp ult i64 %18, %10
+ %24 = icmp ult i64 %21, %18
+ %25 = and i1 %23, %24
+ %26 = zext i1 %25 to i64
+ %27 = add i64 %22, %26
+ %28 = add i64 %14, %4
+ %29 = icmp ult i64 %22, %12
+ %30 = icmp ult i64 %27, %22
+ %31 = and i1 %29, %30
+ %32 = zext i1 %31 to i64
+ %33 = add i64 %28, %32
+ %34 = add i64 %16, %5
+ %35 = icmp ult i64 %28, %14
+ %36 = icmp ult i64 %33, %28
+ %37 = and i1 %35, %36
+ %38 = zext i1 %37 to i64
+ %39 = add i64 %34, %38
+ store i64 %17, i64* %7, align 8
+ store i64 %21, i64* %9, align 8
+ store i64 %27, i64* %11, align 8
+ store i64 %33, i64* %13, align 8
+ store i64 %39, i64* %15, align 8
+ %40 = icmp ult i64 %34, %16
+ %41 = icmp ult i64 %39, %34
+ %42 = and i1 %40, %41
+ %43 = zext i1 %42 to i32
+ ret i32 %43
+}
+
define void @add_U320_without_i128_or_no_ret(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: add_U320_without_i128_or_no_ret:
; CHECK: # %bb.0:
-; CHECK-NEXT: addq 8(%rdi), %rdx
-; CHECK-NEXT: setb %r10b
; CHECK-NEXT: addq %rsi, (%rdi)
-; CHECK-NEXT: adcq $0, %rdx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 16(%rdi), %rcx
-; CHECK-NEXT: setb %r11b
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movzbl %al, %esi
-; CHECK-NEXT: addq %rcx, %rsi
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: addq 24(%rdi), %r8
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: orb %r11b, %cl
-; CHECK-NEXT: movzbl %cl, %ecx
-; CHECK-NEXT: addq %r8, %rcx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: addq 32(%rdi), %r9
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: addq %r9, %rax
-; CHECK-NEXT: movq %rdx, 8(%rdi)
-; CHECK-NEXT: movq %rsi, 16(%rdi)
-; CHECK-NEXT: movq %rcx, 24(%rdi)
-; CHECK-NEXT: movq %rax, 32(%rdi)
+; CHECK-NEXT: adcq %rdx, 8(%rdi)
+; CHECK-NEXT: adcq %rcx, 16(%rdi)
+; CHECK-NEXT: adcq %r8, 24(%rdi)
+; CHECK-NEXT: adcq %r9, 32(%rdi)
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
%8 = load i64, i64* %7, align 8
@@ -747,34 +730,12 @@ define void @add_U320_without_i128_or_no_ret(%struct.U320* nocapture dereference
define i32 @add_U320_uaddo(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: add_U320_uaddo:
; CHECK: # %bb.0:
-; CHECK-NEXT: addq 8(%rdi), %rdx
-; CHECK-NEXT: setb %r10b
; CHECK-NEXT: addq %rsi, (%rdi)
-; CHECK-NEXT: adcq $0, %rdx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movzbl %al, %esi
-; CHECK-NEXT: addq 16(%rdi), %rcx
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: addq %rsi, %rcx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movzbl %al, %esi
-; CHECK-NEXT: addq 24(%rdi), %r8
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: addq %rsi, %r8
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movzbl %al, %esi
-; CHECK-NEXT: addq 32(%rdi), %r9
-; CHECK-NEXT: setb %r10b
-; CHECK-NEXT: addq %rsi, %r9
+; CHECK-NEXT: adcq %rdx, 8(%rdi)
+; CHECK-NEXT: adcq %rcx, 16(%rdi)
+; CHECK-NEXT: adcq %r8, 24(%rdi)
+; CHECK-NEXT: adcq %r9, 32(%rdi)
; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %r10b, %al
-; CHECK-NEXT: movq %rdx, 8(%rdi)
-; CHECK-NEXT: movq %rcx, 16(%rdi)
-; CHECK-NEXT: movq %r8, 24(%rdi)
-; CHECK-NEXT: movq %r9, 32(%rdi)
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
@@ -838,22 +799,14 @@ define void @PR39464(%struct.U192* noalias nocapture sret %0, %struct.U192* noca
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq (%rsi), %rcx
-; CHECK-NEXT: movq (%rdx), %r8
-; CHECK-NEXT: leaq (%rcx,%r8), %rdi
-; CHECK-NEXT: movq %rdi, (%rax)
-; CHECK-NEXT: movq 8(%rsi), %rdi
-; CHECK-NEXT: addq 8(%rdx), %rdi
-; CHECK-NEXT: setb %r9b
-; CHECK-NEXT: addq %r8, %rcx
-; CHECK-NEXT: adcq $0, %rdi
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: orb %r9b, %cl
-; CHECK-NEXT: movzbl %cl, %ecx
-; CHECK-NEXT: movq %rdi, 8(%rax)
-; CHECK-NEXT: movq 16(%rsi), %rsi
-; CHECK-NEXT: addq 16(%rdx), %rsi
-; CHECK-NEXT: addq %rcx, %rsi
-; CHECK-NEXT: movq %rsi, 16(%rax)
+; CHECK-NEXT: addq (%rdx), %rcx
+; CHECK-NEXT: movq %rcx, (%rdi)
+; CHECK-NEXT: movq 8(%rsi), %rcx
+; CHECK-NEXT: adcq 8(%rdx), %rcx
+; CHECK-NEXT: movq %rcx, 8(%rdi)
+; CHECK-NEXT: movq 16(%rsi), %rcx
+; CHECK-NEXT: adcq 16(%rdx), %rcx
+; CHECK-NEXT: movq %rcx, 16(%rdi)
; CHECK-NEXT: retq
%4 = getelementptr inbounds %struct.U192, %struct.U192* %1, i64 0, i32 0, i64 0
%5 = load i64, i64* %4, align 8
@@ -896,12 +849,9 @@ define void @PR39464(%struct.U192* noalias nocapture sret %0, %struct.U192* noca
define zeroext i1 @uaddo_U128_without_i128_or(i64 %0, i64 %1, i64 %2, i64 %3, %uint128* nocapture %4) nounwind {
; CHECK-LABEL: uaddo_U128_without_i128_or:
; CHECK: # %bb.0:
-; CHECK-NEXT: addq %rcx, %rsi
-; CHECK-NEXT: setb %cl
; CHECK-NEXT: addq %rdx, %rdi
-; CHECK-NEXT: adcq $0, %rsi
+; CHECK-NEXT: adcq %rcx, %rsi
; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %cl, %al
; CHECK-NEXT: movq %rsi, (%r8)
; CHECK-NEXT: movq %rdi, 8(%r8)
; CHECK-NEXT: retq
@@ -927,18 +877,12 @@ define void @add_U192_without_i128_or(%uint192* sret %0, i64 %1, i64 %2, i64 %3,
; CHECK-LABEL: add_U192_without_i128_or:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: addq %r9, %rdx
-; CHECK-NEXT: setb %dil
; CHECK-NEXT: addq %r8, %rsi
-; CHECK-NEXT: adcq $0, %rdx
-; CHECK-NEXT: setb %r8b
-; CHECK-NEXT: orb %dil, %r8b
-; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT: movzbl %r8b, %edi
-; CHECK-NEXT: addq %rcx, %rdi
-; CHECK-NEXT: movq %rdi, (%rax)
-; CHECK-NEXT: movq %rdx, 8(%rax)
-; CHECK-NEXT: movq %rsi, 16(%rax)
+; CHECK-NEXT: adcq %r9, %rdx
+; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: movq %rcx, (%rdi)
+; CHECK-NEXT: movq %rdx, 8(%rdi)
+; CHECK-NEXT: movq %rsi, 16(%rdi)
; CHECK-NEXT: retq
%8 = add i64 %4, %1
%9 = icmp ult i64 %8, %1
@@ -969,29 +913,18 @@ define void @add_U256_without_i128_or_by_i64_words(%uint256* sret %0, %uint256*
; CHECK-LABEL: add_U256_without_i128_or_by_i64_words:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq (%rdx), %r9
-; CHECK-NEXT: movq 8(%rdx), %r10
-; CHECK-NEXT: addq 8(%rsi), %r10
-; CHECK-NEXT: setb %r8b
-; CHECK-NEXT: addq (%rsi), %r9
-; CHECK-NEXT: adcq $0, %r10
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: orb %r8b, %cl
-; CHECK-NEXT: movq 16(%rdx), %rdi
-; CHECK-NEXT: addq 16(%rsi), %rdi
-; CHECK-NEXT: setb %r8b
-; CHECK-NEXT: movzbl %cl, %r11d
-; CHECK-NEXT: addq %rdi, %r11
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: orb %r8b, %cl
+; CHECK-NEXT: movq (%rdx), %r8
+; CHECK-NEXT: movq 8(%rdx), %rdi
+; CHECK-NEXT: addq (%rsi), %r8
+; CHECK-NEXT: adcq 8(%rsi), %rdi
+; CHECK-NEXT: movq 16(%rdx), %rcx
+; CHECK-NEXT: adcq 16(%rsi), %rcx
; CHECK-NEXT: movq 24(%rdx), %rdx
-; CHECK-NEXT: addq 24(%rsi), %rdx
-; CHECK-NEXT: movzbl %cl, %ecx
-; CHECK-NEXT: addq %rdx, %rcx
-; CHECK-NEXT: movq %rcx, (%rax)
-; CHECK-NEXT: movq %r11, 8(%rax)
-; CHECK-NEXT: movq %r10, 16(%rax)
-; CHECK-NEXT: movq %r9, 24(%rax)
+; CHECK-NEXT: adcq 24(%rsi), %rdx
+; CHECK-NEXT: movq %rdx, (%rax)
+; CHECK-NEXT: movq %rcx, 8(%rax)
+; CHECK-NEXT: movq %rdi, 16(%rax)
+; CHECK-NEXT: movq %r8, 24(%rax)
; CHECK-NEXT: retq
%4 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 0, i32 0
%5 = load i64, i64* %4, align 8
@@ -1043,24 +976,15 @@ define void @add_U256_without_i128_or_recursive(%uint256* sret %0, %uint256* %1,
; CHECK-LABEL: add_U256_without_i128_or_recursive:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq (%rdx), %r9
+; CHECK-NEXT: movq (%rdx), %r8
; CHECK-NEXT: movq 8(%rdx), %rdi
-; CHECK-NEXT: addq 8(%rsi), %rdi
-; CHECK-NEXT: setb %r8b
-; CHECK-NEXT: addq (%rsi), %r9
-; CHECK-NEXT: adcq $0, %rdi
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: orb %r8b, %cl
-; CHECK-NEXT: movq 16(%rdx), %r8
-; CHECK-NEXT: movq 24(%rdx), %r10
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: addq 16(%rsi), %r8
-; CHECK-NEXT: setb %dl
-; CHECK-NEXT: addq 24(%rsi), %r10
-; CHECK-NEXT: movzbl %cl, %ecx
-; CHECK-NEXT: addq %r8, %rcx
-; CHECK-NEXT: adcq %r10, %rdx
-; CHECK-NEXT: movq %r9, (%rax)
+; CHECK-NEXT: addq (%rsi), %r8
+; CHECK-NEXT: adcq 8(%rsi), %rdi
+; CHECK-NEXT: movq 16(%rdx), %rcx
+; CHECK-NEXT: movq 24(%rdx), %rdx
+; CHECK-NEXT: adcq 16(%rsi), %rcx
+; CHECK-NEXT: adcq 24(%rsi), %rdx
+; CHECK-NEXT: movq %r8, (%rax)
; CHECK-NEXT: movq %rdi, 8(%rax)
; CHECK-NEXT: movq %rcx, 16(%rax)
; CHECK-NEXT: movq %rdx, 24(%rax)
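
The subtraction side matches the same diamond, except that
combineCarryDiamond requires the borrow in to be the righthand operand of
the inner sub. A hand-written sketch of the USUBO shape (names invented;
it mirrors the IR in the tests below):

  define { i64, i1 } @borrow_diamond(i64 %a, i64 %b, i1 %borrowin) {
    %diff = sub i64 %a, %b
    %borrowx = icmp ult i64 %a, %b        ; borrow out of a - b
    %bin = zext i1 %borrowin to i64
    %res = sub i64 %diff, %bin            ; borrow in on the righthand side
    %borrowy = icmp ult i64 %diff, %bin   ; borrow out of subtracting it
    %borrowout = or i1 %borrowx, %borrowy
    %r0 = insertvalue { i64, i1 } undef, i64 %res, 0
    %r1 = insertvalue { i64, i1 } %r0, i1 %borrowout, 1
    ret { i64, i1 } %r1
  }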
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
index 9afbcc8db218..f5476cdebdd3 100644
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -192,51 +192,13 @@ define i64 @sub_from_carry(i64 %x, i64 %y, i64* %valout, i64 %z) {
define i32 @sub_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: sub_U320_without_i128_or:
; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: .cfi_offset %rbx, -24
-; CHECK-NEXT: .cfi_offset %r14, -16
-; CHECK-NEXT: movq 8(%rdi), %r14
-; CHECK-NEXT: movq 16(%rdi), %r10
-; CHECK-NEXT: movq 24(%rdi), %r11
-; CHECK-NEXT: movq 32(%rdi), %rbx
-; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: subq %rsi, (%rdi)
+; CHECK-NEXT: sbbq %rdx, 8(%rdi)
+; CHECK-NEXT: sbbq %rcx, 16(%rdi)
+; CHECK-NEXT: sbbq %r8, 24(%rdi)
+; CHECK-NEXT: sbbq %r9, 32(%rdi)
; CHECK-NEXT: setb %al
-; CHECK-NEXT: subq %rdx, %r14
-; CHECK-NEXT: setb %dl
-; CHECK-NEXT: subq %rax, %r14
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: subq %rcx, %r10
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: orb %dl, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: subq %rax, %r10
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: subq %r8, %r11
-; CHECK-NEXT: setb %dl
-; CHECK-NEXT: orb %cl, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: subq %rax, %r11
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: subq %r9, %rbx
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: orb %dl, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: subq %rax, %rbx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: movq %r14, 8(%rdi)
-; CHECK-NEXT: movq %r10, 16(%rdi)
-; CHECK-NEXT: movq %r11, 24(%rdi)
-; CHECK-NEXT: movq %rbx, 32(%rdi)
-; CHECK-NEXT: orb %cl, %al
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: popq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
%8 = load i64, i64* %7, align 8
@@ -286,51 +248,13 @@ define i32 @sub_U320_without_i128_or(%struct.U320* nocapture dereferenceable(40)
define i32 @sub_U320_usubo(%struct.U320* nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) {
; CHECK-LABEL: sub_U320_usubo:
; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: .cfi_offset %rbx, -24
-; CHECK-NEXT: .cfi_offset %r14, -16
-; CHECK-NEXT: movq 8(%rdi), %r14
-; CHECK-NEXT: movq 16(%rdi), %r10
-; CHECK-NEXT: movq 24(%rdi), %r11
-; CHECK-NEXT: movq 32(%rdi), %rbx
-; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: subq %rsi, (%rdi)
+; CHECK-NEXT: sbbq %rdx, 8(%rdi)
+; CHECK-NEXT: sbbq %rcx, 16(%rdi)
+; CHECK-NEXT: sbbq %r8, 24(%rdi)
+; CHECK-NEXT: sbbq %r9, 32(%rdi)
; CHECK-NEXT: setb %al
-; CHECK-NEXT: subq %rdx, %r14
-; CHECK-NEXT: setb %dl
-; CHECK-NEXT: subq %rax, %r14
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %dl, %al
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: subq %rcx, %r10
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: subq %rax, %r10
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %cl, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: subq %r8, %r11
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: subq %rax, %r11
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %cl, %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: subq %r9, %rbx
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: subq %rax, %rbx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: orb %cl, %al
-; CHECK-NEXT: movq %r14, 8(%rdi)
-; CHECK-NEXT: movq %r10, 16(%rdi)
-; CHECK-NEXT: movq %r11, 24(%rdi)
-; CHECK-NEXT: movq %rbx, 32(%rdi)
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: popq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%7 = getelementptr inbounds %struct.U320, %struct.U320* %0, i64 0, i32 0, i64 0
%8 = load i64, i64* %7, align 8
@@ -393,22 +317,14 @@ define void @PR39464(%struct.U192* noalias nocapture sret %0, %struct.U192* noca
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq (%rsi), %rcx
-; CHECK-NEXT: xorl %r9d, %r9d
; CHECK-NEXT: subq (%rdx), %rcx
-; CHECK-NEXT: setb %r9b
; CHECK-NEXT: movq %rcx, (%rdi)
-; CHECK-NEXT: movq 8(%rsi), %rdi
-; CHECK-NEXT: subq 8(%rdx), %rdi
-; CHECK-NEXT: setb %r8b
-; CHECK-NEXT: subq %r9, %rdi
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: orb %r8b, %cl
-; CHECK-NEXT: movzbl %cl, %ecx
-; CHECK-NEXT: movq %rdi, 8(%rax)
-; CHECK-NEXT: movq 16(%rsi), %rsi
-; CHECK-NEXT: subq 16(%rdx), %rsi
-; CHECK-NEXT: subq %rcx, %rsi
-; CHECK-NEXT: movq %rsi, 16(%rax)
+; CHECK-NEXT: movq 8(%rsi), %rcx
+; CHECK-NEXT: sbbq 8(%rdx), %rcx
+; CHECK-NEXT: movq %rcx, 8(%rdi)
+; CHECK-NEXT: movq 16(%rsi), %rcx
+; CHECK-NEXT: sbbq 16(%rdx), %rcx
+; CHECK-NEXT: movq %rcx, 16(%rdi)
; CHECK-NEXT: retq
%4 = getelementptr inbounds %struct.U192, %struct.U192* %1, i64 0, i32 0, i64 0
%5 = load i64, i64* %4, align 8
@@ -454,28 +370,23 @@ define void @sub_U256_without_i128_or_recursive(%uint256* sret %0, %uint256* %1,
; CHECK-LABEL: sub_U256_without_i128_or_recursive:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq (%rsi), %r8
+; CHECK-NEXT: movq (%rsi), %r9
; CHECK-NEXT: movq 8(%rsi), %r10
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: subq (%rdx), %r8
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: subq 8(%rdx), %r10
-; CHECK-NEXT: setb %r9b
-; CHECK-NEXT: subq %rcx, %r10
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: orb %r9b, %cl
-; CHECK-NEXT: movq 16(%rsi), %rdi
+; CHECK-NEXT: subq (%rdx), %r9
+; CHECK-NEXT: sbbq 8(%rdx), %r10
+; CHECK-NEXT: setb %r8b
+; CHECK-NEXT: movq 16(%rsi), %rcx
; CHECK-NEXT: movq 24(%rsi), %rsi
-; CHECK-NEXT: xorl %r9d, %r9d
-; CHECK-NEXT: subq 16(%rdx), %rdi
-; CHECK-NEXT: setb %r9b
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: subq 16(%rdx), %rcx
+; CHECK-NEXT: setb %dil
; CHECK-NEXT: subq 24(%rdx), %rsi
-; CHECK-NEXT: movzbl %cl, %ecx
-; CHECK-NEXT: subq %rcx, %rdi
-; CHECK-NEXT: sbbq %r9, %rsi
-; CHECK-NEXT: movq %r8, (%rax)
+; CHECK-NEXT: movzbl %r8b, %edx
+; CHECK-NEXT: subq %rdx, %rcx
+; CHECK-NEXT: sbbq %rdi, %rsi
+; CHECK-NEXT: movq %r9, (%rax)
; CHECK-NEXT: movq %r10, 8(%rax)
-; CHECK-NEXT: movq %rdi, 16(%rax)
+; CHECK-NEXT: movq %rcx, 16(%rax)
; CHECK-NEXT: movq %rsi, 24(%rax)
; CHECK-NEXT: retq
%4 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 0, i32 0