[llvm] Remove the uaddo-only specification (PR #160392)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 25 12:03:27 PDT 2025
https://github.com/AZero13 updated https://github.com/llvm/llvm-project/pull/160392
From a22d4c1d4e7a306d5b8a4b92b0a18e178e022ded Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Tue, 23 Sep 2025 16:05:38 -0400
Subject: [PATCH 01/17] usubo
---
llvm/include/llvm/CodeGen/TargetLowering.h | 7 +------
llvm/lib/Target/X86/X86ISelLowering.cpp | 6 ++----
2 files changed, 3 insertions(+), 10 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index c45e03a7bdad84..bcf14a77f9d702 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3458,12 +3458,7 @@ class LLVM_ABI TargetLoweringBase {
// Form it if it is legal.
if (isOperationLegal(Opcode, VT))
return true;
-
- // TODO: The default logic is inherited from code in CodeGenPrepare.
- // The opcode should not make a difference by default?
- if (Opcode != ISD::UADDO)
- return false;
-
+
// Allow the transform as long as we have an integer type that is not
// obviously illegal and unsupported and if the math result is used
// besides the overflow check. On some targets (e.g. SPARC), it is
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 80fbcaa562032a..47d0010504d28b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3405,10 +3405,8 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
bool) const {
- // TODO: Allow vectors?
- if (VT.isVector())
- return false;
- return VT.isSimple() || !isOperationExpand(Opcode, VT);
+
+ return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
}
bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
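[Editorial note, not part of the patch] The hunk above drops the `Opcode != ISD::UADDO` early-out from the default TargetLoweringBase::shouldFormOverflowOp and routes the X86 override through it, so CodeGenPrepare may now form overflow intrinsics for the unsigned-subtract idiom as well. A minimal IR sketch of that fold, with illustrative function names not taken from the patch's tests:

  ; Input as written by the frontend: separate sub and compare.
  define i1 @usubo_before(i64 %x, i64 %y, ptr %p) {
    %d  = sub i64 %x, %y
    %ov = icmp ult i64 %x, %y      ; x - y wraps in unsigned math exactly when x < y
    store i64 %d, ptr %p
    ret i1 %ov
  }

  ; What CodeGenPrepare forms once shouldFormOverflowOp(ISD::USUBO, i64, /*MathUsed=*/true)
  ; returns true: one usub.with.overflow call feeding both users.
  declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64)

  define i1 @usubo_after(i64 %x, i64 %y, ptr %p) {
    %t  = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %x, i64 %y)
    %d  = extractvalue { i64, i1 } %t, 0
    %ov = extractvalue { i64, i1 } %t, 1
    store i64 %d, ptr %p
    ret i1 %ov
  }

The test updates in the follow-up patch below are the autogenerated consequences of this hook change on the affected targets.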
From b773dc60fca026f6e4b434efe531ad35c35be75e Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Tue, 23 Sep 2025 18:11:11 -0400
Subject: [PATCH 02/17] r
---
llvm/test/CodeGen/AArch64/abdu-neg.ll | 2 +-
llvm/test/CodeGen/AArch64/arm64-srl-and.ll | 9 +-
llvm/test/CodeGen/AArch64/cgp-usubo.ll | 8 +-
llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll | 12 +-
.../AArch64/local-bounds-single-trap.ll | 16 +-
llvm/test/CodeGen/AArch64/sat-add.ll | 44 +-
.../AArch64/signed-truncation-check.ll | 6 +-
.../CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll | 8 +-
llvm/test/CodeGen/ARM/select_const.ll | 47 +-
llvm/test/CodeGen/Hexagon/loop-balign.ll | 104 +++-
llvm/test/CodeGen/NVPTX/i128.ll | 72 +--
.../PowerPC/atomicrmw-uinc-udec-wrap.ll | 46 +-
llvm/test/CodeGen/PowerPC/sat-add.ll | 38 +-
llvm/test/CodeGen/RISCV/branch-on-zero.ll | 28 +-
.../test/CodeGen/RISCV/overflow-intrinsics.ll | 84 +--
.../SPIRV/optimizations/add-check-overflow.ll | 4 +
.../CodeGen/Thumb/scheduler-clone-cpsr-def.ll | 33 +-
llvm/test/CodeGen/X86/abdu-neg.ll | 93 ++--
.../X86/div-rem-pair-recomposition-signed.ll | 520 +++++++++---------
.../div-rem-pair-recomposition-unsigned.ll | 432 ++++++++-------
llvm/test/CodeGen/X86/select.ll | 12 +-
.../AArch64/overflow-intrinsics.ll | 126 ++++-
.../SPARC/overflow-intrinsics.ll | 95 +++-
23 files changed, 1057 insertions(+), 782 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/abdu-neg.ll b/llvm/test/CodeGen/AArch64/abdu-neg.ll
index 269cbf03f32a05..606162ade272bd 100644
--- a/llvm/test/CodeGen/AArch64/abdu-neg.ll
+++ b/llvm/test/CodeGen/AArch64/abdu-neg.ll
@@ -355,7 +355,7 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: abd_cmp_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: subs x8, x0, x1
-; CHECK-NEXT: cneg x0, x8, hs
+; CHECK-NEXT: cneg x0, x8, hi
; CHECK-NEXT: ret
%cmp = icmp ult i64 %a, %b
%ab = sub i64 %a, %b
diff --git a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll
index b58f6ba96a5b87..3f4d6f722fdb63 100644
--- a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll
@@ -9,13 +9,12 @@ define i32 @srl_and() {
; CHECK-LABEL: srl_and:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:g
-; CHECK-NEXT: mov w9, #50
; CHECK-NEXT: ldr x8, [x8, :got_lo12:g]
; CHECK-NEXT: ldrh w8, [x8]
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: mov w9, #65535
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: and w0, w8, w8, lsr #16
+; CHECK-NEXT: cmp w8, #50
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: cset w9, ne
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
entry:
%0 = load i16, ptr @g, align 4
diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
index d307107fc07ee6..e49e8e86561c7c 100644
--- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll
+++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
@@ -108,11 +108,9 @@ define i1 @usubo_ugt_constant_op1_i8(i8 %x, ptr %p) nounwind {
define i1 @usubo_eq_constant1_op1_i32(i32 %x, ptr %p) nounwind {
; CHECK-LABEL: usubo_eq_constant1_op1_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: sub w9, w0, #1
-; CHECK-NEXT: cset w8, eq
-; CHECK-NEXT: str w9, [x1]
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: subs w8, w0, #1
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: str w8, [x1]
; CHECK-NEXT: ret
%s = add i32 %x, -1
%ov = icmp eq i32 %x, 0
diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
index 3f4dd116d91f8c..7917be57285910 100644
--- a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
+++ b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
@@ -192,12 +192,12 @@ define i1 @test_conditional2(i32 %a, i32 %b, ptr %c) {
; CHECK-NEXT: mov w22, #2 ; =0x2
; CHECK-NEXT: LBB3_5: ; %for.cond
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: cbz w22, LBB3_8
+; CHECK-NEXT: subs w22, w22, #1
+; CHECK-NEXT: b.lo LBB3_8
; CHECK-NEXT: ; %bb.6: ; %for.body
; CHECK-NEXT: ; in Loop: Header=BB3_5 Depth=1
-; CHECK-NEXT: sub w22, w22, #1
-; CHECK-NEXT: orr w9, w21, w20
; CHECK-NEXT: ldr w10, [x19, w22, sxtw #2]
+; CHECK-NEXT: orr w9, w21, w20
; CHECK-NEXT: cmp w9, w10
; CHECK-NEXT: b.eq LBB3_5
; CHECK-NEXT: ; %bb.7: ; %if.then
@@ -238,12 +238,12 @@ define i1 @test_conditional2(i32 %a, i32 %b, ptr %c) {
; OUTLINE-ATOMICS-NEXT: cset w8, eq
; OUTLINE-ATOMICS-NEXT: LBB3_1: ; %for.cond
; OUTLINE-ATOMICS-NEXT: ; =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT: cbz w22, LBB3_4
+; OUTLINE-ATOMICS-NEXT: subs w22, w22, #1
+; OUTLINE-ATOMICS-NEXT: b.lo LBB3_4
; OUTLINE-ATOMICS-NEXT: ; %bb.2: ; %for.body
; OUTLINE-ATOMICS-NEXT: ; in Loop: Header=BB3_1 Depth=1
-; OUTLINE-ATOMICS-NEXT: sub w22, w22, #1
-; OUTLINE-ATOMICS-NEXT: orr w9, w21, w20
; OUTLINE-ATOMICS-NEXT: ldr w10, [x19, w22, sxtw #2]
+; OUTLINE-ATOMICS-NEXT: orr w9, w21, w20
; OUTLINE-ATOMICS-NEXT: cmp w9, w10
; OUTLINE-ATOMICS-NEXT: b.eq LBB3_1
; OUTLINE-ATOMICS-NEXT: ; %bb.3: ; %if.then
diff --git a/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll b/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll
index 1207eaa2612a3e..f2c84006910c5e 100644
--- a/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll
+++ b/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll
@@ -17,24 +17,22 @@ define dso_local void @f8(i32 noundef %i, i32 noundef %k) #0 {
; CHECK-ASM-NEXT: .cfi_remember_state
; CHECK-ASM-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-ASM-NEXT: sxtw x8, w0
+; CHECK-ASM-NEXT: mov w9, #10 // =0xa
; CHECK-ASM-NEXT: stp w1, w0, [sp, #8]
-; CHECK-ASM-NEXT: cmp x8, #10
-; CHECK-ASM-NEXT: b.hi .LBB0_5
+; CHECK-ASM-NEXT: subs x9, x9, x8
+; CHECK-ASM-NEXT: b.lo .LBB0_5
; CHECK-ASM-NEXT: // %bb.1: // %entry
-; CHECK-ASM-NEXT: mov w9, #10 // =0xa
-; CHECK-ASM-NEXT: sub x9, x9, x8
; CHECK-ASM-NEXT: cbz x9, .LBB0_5
; CHECK-ASM-NEXT: // %bb.2:
; CHECK-ASM-NEXT: ldrsw x9, [sp, #8]
+; CHECK-ASM-NEXT: mov w10, #10 // =0xa
+; CHECK-ASM-NEXT: subs x11, x10, x9
; CHECK-ASM-NEXT: adrp x10, .L_MergedGlobals
; CHECK-ASM-NEXT: add x10, x10, :lo12:.L_MergedGlobals
; CHECK-ASM-NEXT: strb wzr, [x10, x8]
-; CHECK-ASM-NEXT: cmp x9, #10
-; CHECK-ASM-NEXT: b.hi .LBB0_6
+; CHECK-ASM-NEXT: b.lo .LBB0_6
; CHECK-ASM-NEXT: // %bb.3:
-; CHECK-ASM-NEXT: mov w8, #10 // =0xa
-; CHECK-ASM-NEXT: sub x8, x8, x9
-; CHECK-ASM-NEXT: cbz x8, .LBB0_6
+; CHECK-ASM-NEXT: cbz x11, .LBB0_6
; CHECK-ASM-NEXT: // %bb.4:
; CHECK-ASM-NEXT: add x8, x10, x9
; CHECK-ASM-NEXT: strb wzr, [x8, #10]
diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll
index ecd48d6b7c65bf..12044ebe20fa1d 100644
--- a/llvm/test/CodeGen/AArch64/sat-add.ll
+++ b/llvm/test/CodeGen/AArch64/sat-add.ll
@@ -25,9 +25,9 @@ define i8 @unsigned_sat_constant_i8_using_cmp_sum(i8 %x) {
; CHECK-LABEL: unsigned_sat_constant_i8_using_cmp_sum:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: add w8, w8, #42
-; CHECK-NEXT: tst w8, #0x100
-; CHECK-NEXT: csinv w0, w8, wzr, eq
+; CHECK-NEXT: add w9, w0, #42
+; CHECK-NEXT: cmp w8, w9, uxtb
+; CHECK-NEXT: csinv w0, w9, wzr, ls
; CHECK-NEXT: ret
%a = add i8 %x, 42
%c = icmp ugt i8 %x, %a
@@ -68,9 +68,9 @@ define i16 @unsigned_sat_constant_i16_using_cmp_sum(i16 %x) {
; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_sum:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: add w8, w8, #42
-; CHECK-NEXT: tst w8, #0x10000
-; CHECK-NEXT: csinv w0, w8, wzr, eq
+; CHECK-NEXT: add w9, w0, #42
+; CHECK-NEXT: cmp w8, w9, uxth
+; CHECK-NEXT: csinv w0, w9, wzr, ls
; CHECK-NEXT: ret
%a = add i16 %x, 42
%c = icmp ugt i16 %x, %a
@@ -188,9 +188,9 @@ define i8 @unsigned_sat_variable_i8_using_cmp_sum(i8 %x, i8 %y) {
; CHECK-LABEL: unsigned_sat_variable_i8_using_cmp_sum:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: add w8, w8, w1, uxtb
-; CHECK-NEXT: tst w8, #0x100
-; CHECK-NEXT: csinv w0, w8, wzr, eq
+; CHECK-NEXT: add w9, w0, w1
+; CHECK-NEXT: cmp w8, w9, uxtb
+; CHECK-NEXT: csinv w0, w9, wzr, ls
; CHECK-NEXT: ret
%a = add i8 %x, %y
%c = icmp ugt i8 %x, %a
@@ -201,11 +201,11 @@ define i8 @unsigned_sat_variable_i8_using_cmp_sum(i8 %x, i8 %y) {
define i8 @unsigned_sat_variable_i8_using_cmp_notval(i8 %x, i8 %y) {
; CHECK-LABEL: unsigned_sat_variable_i8_using_cmp_notval:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w1, #0xff
-; CHECK-NEXT: add w9, w0, w1
-; CHECK-NEXT: add w8, w8, w0, uxtb
-; CHECK-NEXT: tst w8, #0x100
-; CHECK-NEXT: csinv w0, w9, wzr, eq
+; CHECK-NEXT: and w8, w0, #0xff
+; CHECK-NEXT: mvn w9, w1
+; CHECK-NEXT: add w10, w0, w1
+; CHECK-NEXT: cmp w8, w9, uxtb
+; CHECK-NEXT: csinv w0, w10, wzr, ls
; CHECK-NEXT: ret
%noty = xor i8 %y, -1
%a = add i8 %x, %y
@@ -234,9 +234,9 @@ define i16 @unsigned_sat_variable_i16_using_cmp_sum(i16 %x, i16 %y) {
; CHECK-LABEL: unsigned_sat_variable_i16_using_cmp_sum:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: add w8, w8, w1, uxth
-; CHECK-NEXT: tst w8, #0x10000
-; CHECK-NEXT: csinv w0, w8, wzr, eq
+; CHECK-NEXT: add w9, w0, w1
+; CHECK-NEXT: cmp w8, w9, uxth
+; CHECK-NEXT: csinv w0, w9, wzr, ls
; CHECK-NEXT: ret
%a = add i16 %x, %y
%c = icmp ugt i16 %x, %a
@@ -247,11 +247,11 @@ define i16 @unsigned_sat_variable_i16_using_cmp_sum(i16 %x, i16 %y) {
define i16 @unsigned_sat_variable_i16_using_cmp_notval(i16 %x, i16 %y) {
; CHECK-LABEL: unsigned_sat_variable_i16_using_cmp_notval:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w1, #0xffff
-; CHECK-NEXT: add w9, w0, w1
-; CHECK-NEXT: add w8, w8, w0, uxth
-; CHECK-NEXT: tst w8, #0x10000
-; CHECK-NEXT: csinv w0, w9, wzr, eq
+; CHECK-NEXT: and w8, w0, #0xffff
+; CHECK-NEXT: mvn w9, w1
+; CHECK-NEXT: add w10, w0, w1
+; CHECK-NEXT: cmp w8, w9, uxth
+; CHECK-NEXT: csinv w0, w10, wzr, ls
; CHECK-NEXT: ret
%noty = xor i16 %y, -1
%a = add i16 %x, %y
diff --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
index 7c80f9320faec1..0720a7f72bd8c1 100644
--- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
@@ -313,9 +313,9 @@ define i1 @add_ultcmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind {
define i1 @add_ultcmp_bad_i8_i16(i16 %x) nounwind {
; CHECK-LABEL: add_ultcmp_bad_i8_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: add w8, w8, #128
-; CHECK-NEXT: lsr w0, w8, #16
+; CHECK-NEXT: add w8, w0, #128
+; CHECK-NEXT: tst w8, #0xff80
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%tmp0 = add i16 %x, 128 ; 1U << (8-1)
%tmp1 = icmp ult i16 %tmp0, 128 ; 1U << (8-1)
diff --git a/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll
index 433fb325a7349f..c37afeeea375d1 100644
--- a/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll
@@ -147,11 +147,11 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
; CHECK-NEXT: .LBB6_1: @ %atomicrmw.start
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrex r12, [r0]
-; CHECK-NEXT: mov r3, r1
; CHECK-NEXT: cmp r12, r1
-; CHECK-NEXT: subls r3, r12, #1
-; CHECK-NEXT: cmp r12, #0
-; CHECK-NEXT: moveq r3, r1
+; CHECK-NEXT: sub r3, r12, #1
+; CHECK-NEXT: movhi r3, r1
+; CHECK-NEXT: cmp r12, #1
+; CHECK-NEXT: movlo r3, r1
; CHECK-NEXT: strex r2, r3, [r0]
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: bne .LBB6_1
diff --git a/llvm/test/CodeGen/ARM/select_const.ll b/llvm/test/CodeGen/ARM/select_const.ll
index 180daa12e7c52d..6d8a7f70754d3d 100644
--- a/llvm/test/CodeGen/ARM/select_const.ll
+++ b/llvm/test/CodeGen/ARM/select_const.ll
@@ -763,46 +763,35 @@ define i64 @opaque_constant2(i1 %cond, i64 %x) {
define i64 @func(i64 %arg) {
; ARM-LABEL: func:
; ARM: @ %bb.0: @ %entry
-; ARM-NEXT: adds r0, r0, #1
-; ARM-NEXT: mov r2, #0
-; ARM-NEXT: adcs r0, r1, #0
+; ARM-NEXT: and r0, r0, r1
; ARM-NEXT: mov r1, #0
-; ARM-NEXT: adcs r0, r2, #0
-; ARM-NEXT: movne r0, #8
+; ARM-NEXT: cmn r0, #1
+; ARM-NEXT: mov r0, #0
+; ARM-NEXT: moveq r0, #8
; ARM-NEXT: mov pc, lr
;
; THUMB2-LABEL: func:
; THUMB2: @ %bb.0: @ %entry
+; THUMB2-NEXT: ands r0, r1
+; THUMB2-NEXT: movs r1, #0
; THUMB2-NEXT: adds r0, #1
-; THUMB2-NEXT: mov.w r2, #0
-; THUMB2-NEXT: adcs r0, r1, #0
-; THUMB2-NEXT: mov.w r1, #0
-; THUMB2-NEXT: adcs r0, r2, #0
-; THUMB2-NEXT: it ne
-; THUMB2-NEXT: movne r0, #8
+; THUMB2-NEXT: mov.w r0, #0
+; THUMB2-NEXT: it eq
+; THUMB2-NEXT: moveq r0, #8
; THUMB2-NEXT: bx lr
;
; THUMB-LABEL: func:
; THUMB: @ %bb.0: @ %entry
-; THUMB-NEXT: .save {r4, lr}
-; THUMB-NEXT: push {r4, lr}
-; THUMB-NEXT: movs r2, #0
-; THUMB-NEXT: adds r3, r0, #1
-; THUMB-NEXT: mov r12, r1
-; THUMB-NEXT: mov r3, r12
-; THUMB-NEXT: adcs r3, r2
-; THUMB-NEXT: mov r12, r2
-; THUMB-NEXT: mov r3, r12
-; THUMB-NEXT: adcs r3, r2
-; THUMB-NEXT: subs r4, r3, #1
+; THUMB-NEXT: ands r0, r1
+; THUMB-NEXT: movs r1, #0
; THUMB-NEXT: adds r0, r0, #1
-; THUMB-NEXT: adcs r1, r2
-; THUMB-NEXT: sbcs r3, r4
-; THUMB-NEXT: lsls r0, r3, #3
-; THUMB-NEXT: movs r1, r2
-; THUMB-NEXT: pop {r4}
-; THUMB-NEXT: pop {r2}
-; THUMB-NEXT: bx r2
+; THUMB-NEXT: beq .LBB26_2
+; THUMB-NEXT: @ %bb.1: @ %entry
+; THUMB-NEXT: movs r0, r1
+; THUMB-NEXT: bx lr
+; THUMB-NEXT: .LBB26_2:
+; THUMB-NEXT: movs r0, #8
+; THUMB-NEXT: bx lr
entry:
%0 = add i64 %arg, 1
%1 = icmp ult i64 %0, 1
diff --git a/llvm/test/CodeGen/Hexagon/loop-balign.ll b/llvm/test/CodeGen/Hexagon/loop-balign.ll
index 78285f6d1ae64e..c3b27a84ac3f13 100644
--- a/llvm/test/CodeGen/Hexagon/loop-balign.ll
+++ b/llvm/test/CodeGen/Hexagon/loop-balign.ll
@@ -1,9 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=hexagon -O3 < %s | FileCheck %s -check-prefix=BALIGN
; BALIGN: .p2align{{.*}}5
; The test for checking the alignment of 'for.body4.for.body4_crit_edge' basic block
define dso_local void @foo(i32 %nCol, i32 %nRow, ptr nocapture %resMat) local_unnamed_addr {
+; BALIGN-LABEL: foo:
+; BALIGN: .cfi_startproc
+; BALIGN-NEXT: // %bb.0: // %entry
+; BALIGN-NEXT: {
+; BALIGN-NEXT: r5 = asl(r1,#2)
+; BALIGN-NEXT: r3 = add(r0,#-1)
+; BALIGN-NEXT: r4 = #-2
+; BALIGN-NEXT: }
+; BALIGN-NEXT: // implicit-def: $d3
+; BALIGN-NEXT: {
+; BALIGN-NEXT: p0 = cmp.gt(r3,#0)
+; BALIGN-NEXT: r3 = #0
+; BALIGN-NEXT: r8 = r5
+; BALIGN-NEXT: if (!p0.new) r0 = #1
+; BALIGN-NEXT: }
+; BALIGN-NEXT: {
+; BALIGN-NEXT: p0 = cmp.gt(r1,#0)
+; BALIGN-NEXT: jump .LBB0_1
+; BALIGN-NEXT: }
+; BALIGN-NEXT: .p2align 4
+; BALIGN-NEXT: .LBB0_8: // %for.end7
+; BALIGN-NEXT: // in Loop: Header=BB0_1 Depth=1
+; BALIGN-NEXT: {
+; BALIGN-NEXT: r3 = add(r3,#1)
+; BALIGN-NEXT: r4 = add(r4,#1)
+; BALIGN-NEXT: }
+; BALIGN-NEXT: {
+; BALIGN-NEXT: p1 = cmp.eq(r3,r0)
+; BALIGN-NEXT: if (p1.new) jumpr:nt r31
+; BALIGN-NEXT: }
+; BALIGN-NEXT: .LBB0_1: // %Outerloop
+; BALIGN-NEXT: // =>This Loop Header: Depth=1
+; BALIGN-NEXT: // Child Loop BB0_3 Depth 2
+; BALIGN-NEXT: // Child Loop BB0_6 Depth 3
+; BALIGN-NEXT: {
+; BALIGN-NEXT: if (!p0) jump:nt .LBB0_8
+; BALIGN-NEXT: }
+; BALIGN-NEXT: // %bb.2: // %for.body.lr.ph
+; BALIGN-NEXT: // in Loop: Header=BB0_1 Depth=1
+; BALIGN-NEXT: {
+; BALIGN-NEXT: loop1(.LBB0_3,r1)
+; BALIGN-NEXT: p1 = cmp.eq(r3,#0)
+; BALIGN-NEXT: p2 = cmp.eq(r3,#1)
+; BALIGN-NEXT: jump .LBB0_3
+; BALIGN-NEXT: }
+; BALIGN-NEXT: .p2align 4
+; BALIGN-NEXT: .LBB0_7: // %for.end
+; BALIGN-NEXT: // in Loop: Header=BB0_3 Depth=2
+; BALIGN-NEXT: {
+; BALIGN-NEXT: r9 = clb(r7:6)
+; BALIGN-NEXT: memw(r2+#0) = r9.new
+; BALIGN-NEXT: }
+; BALIGN-NEXT: {
+; BALIGN-NEXT: nop
+; BALIGN-NEXT: nop
+; BALIGN-NEXT: nop
+; BALIGN-NEXT: } :endloop1
+; BALIGN-NEXT: {
+; BALIGN-NEXT: jump .LBB0_8
+; BALIGN-NEXT: }
+; BALIGN-NEXT: .LBB0_3: // Block address taken
+; BALIGN-NEXT: // %for.body
+; BALIGN-NEXT: // Parent Loop BB0_1 Depth=1
+; BALIGN-NEXT: // => This Loop Header: Depth=2
+; BALIGN-NEXT: // Child Loop BB0_6 Depth 3
+; BALIGN-NEXT: {
+; BALIGN-NEXT: r12 = r8
+; BALIGN-NEXT: r8 = add(r8,r5)
+; BALIGN-NEXT: if (p1) jump:nt .LBB0_7
+; BALIGN-NEXT: }
+; BALIGN-NEXT: // %bb.4: // %for.body4.peel
+; BALIGN-NEXT: // in Loop: Header=BB0_3 Depth=2
+; BALIGN-NEXT: {
+; BALIGN-NEXT: r12 = memw(r12+#0)
+; BALIGN-NEXT: }
+; BALIGN-NEXT: {
+; BALIGN-NEXT: r7:6 -= mpy(r12,r9)
+; BALIGN-NEXT: if (p2) jump:nt .LBB0_7
+; BALIGN-NEXT: }
+; BALIGN-NEXT: // %bb.5: // %for.body4.preheader.peel.newph
+; BALIGN-NEXT: // in Loop: Header=BB0_3 Depth=2
+; BALIGN-NEXT: {
+; BALIGN-NEXT: r13 = add(r4,#1)
+; BALIGN-NEXT: r12 = memw(r8+#0)
+; BALIGN-NEXT: }
+; BALIGN-NEXT: {
+; BALIGN-NEXT: loop0(.LBB0_6,r13)
+; BALIGN-NEXT: }
+; BALIGN-NEXT: .p2align 4
+; BALIGN-NEXT: .LBB0_6: // Block address taken
+; BALIGN-NEXT: // %for.body4
+; BALIGN-NEXT: // Parent Loop BB0_1 Depth=1
+; BALIGN-NEXT: // Parent Loop BB0_3 Depth=2
+; BALIGN-NEXT: // => This Inner Loop Header: Depth=3
+; BALIGN-NEXT: {
+; BALIGN-NEXT: r7:6 -= mpy(r12,r9)
+; BALIGN-NEXT: nop
+; BALIGN-NEXT: } :endloop0
+; BALIGN-NEXT: {
+; BALIGN-NEXT: jump .LBB0_7
+; BALIGN-NEXT: }
entry:
%shl = shl i32 %nRow, 2
%cmp36 = icmp sgt i32 %nRow, 0
@@ -85,7 +187,7 @@ if.end: ; preds = %for.end7
}
; Function Attrs: nounwind readnone
-declare i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64, i32, i32)
+declare i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64, i32, i32)
; Function Attrs: nounwind readnone
declare i32 @llvm.hexagon.S2.clbp(i64)
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index cdbbabe3e3b05b..25aff73a38b829 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -61,21 +61,21 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: // %bb.3: // %udiv-bb1
; CHECK-NEXT: add.cc.s64 %rd71, %rd26, 1;
; CHECK-NEXT: addc.cc.s64 %rd72, %rd27, 0;
-; CHECK-NEXT: or.b64 %rd30, %rd71, %rd72;
-; CHECK-NEXT: setp.eq.b64 %p16, %rd30, 0;
; CHECK-NEXT: cvt.u32.u64 %r5, %rd26;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd31, %rd3, %r6;
+; CHECK-NEXT: shl.b64 %rd30, %rd3, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd32, %rd2, %r7;
-; CHECK-NEXT: or.b64 %rd33, %rd31, %rd32;
+; CHECK-NEXT: shr.u64 %rd31, %rd2, %r7;
+; CHECK-NEXT: or.b64 %rd32, %rd30, %rd31;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd34, %rd2, %r8;
-; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd76, %rd34, %rd33, %p17;
+; CHECK-NEXT: shl.b64 %rd33, %rd2, %r8;
+; CHECK-NEXT: setp.gt.s32 %p16, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd76, %rd33, %rd32, %p16;
; CHECK-NEXT: shl.b64 %rd75, %rd2, %r6;
+; CHECK-NEXT: or.b64 %rd34, %rd71, %rd72;
+; CHECK-NEXT: setp.eq.b64 %p17, %rd34, 0;
; CHECK-NEXT: mov.b64 %rd69, %rd70;
-; CHECK-NEXT: @%p16 bra $L__BB0_4;
+; CHECK-NEXT: @%p17 bra $L__BB0_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
; CHECK-NEXT: cvt.u32.u64 %r9, %rd71;
; CHECK-NEXT: shr.u64 %rd35, %rd2, %r9;
@@ -191,21 +191,21 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: // %bb.3: // %udiv-bb1
; CHECK-NEXT: add.cc.s64 %rd58, %rd17, 1;
; CHECK-NEXT: addc.cc.s64 %rd59, %rd18, 0;
-; CHECK-NEXT: or.b64 %rd21, %rd58, %rd59;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd21, 0;
; CHECK-NEXT: cvt.u32.u64 %r5, %rd17;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd22, %rd6, %r6;
+; CHECK-NEXT: shl.b64 %rd21, %rd6, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd23, %rd5, %r7;
-; CHECK-NEXT: or.b64 %rd24, %rd22, %rd23;
+; CHECK-NEXT: shr.u64 %rd22, %rd5, %r7;
+; CHECK-NEXT: or.b64 %rd23, %rd21, %rd22;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd25, %rd5, %r8;
-; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd63, %rd25, %rd24, %p15;
+; CHECK-NEXT: shl.b64 %rd24, %rd5, %r8;
+; CHECK-NEXT: setp.gt.s32 %p14, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd63, %rd24, %rd23, %p14;
; CHECK-NEXT: shl.b64 %rd62, %rd5, %r6;
+; CHECK-NEXT: or.b64 %rd25, %rd58, %rd59;
+; CHECK-NEXT: setp.eq.b64 %p15, %rd25, 0;
; CHECK-NEXT: mov.b64 %rd56, %rd57;
-; CHECK-NEXT: @%p14 bra $L__BB1_4;
+; CHECK-NEXT: @%p15 bra $L__BB1_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
; CHECK-NEXT: cvt.u32.u64 %r9, %rd58;
; CHECK-NEXT: shr.u64 %rd26, %rd5, %r9;
@@ -363,21 +363,21 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: // %bb.3: // %udiv-bb1
; CHECK-NEXT: add.cc.s64 %rd66, %rd27, 1;
; CHECK-NEXT: addc.cc.s64 %rd67, %rd28, 0;
-; CHECK-NEXT: or.b64 %rd31, %rd66, %rd67;
-; CHECK-NEXT: setp.eq.b64 %p16, %rd31, 0;
; CHECK-NEXT: cvt.u32.u64 %r5, %rd27;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd32, %rd2, %r6;
+; CHECK-NEXT: shl.b64 %rd31, %rd2, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd33, %rd1, %r7;
-; CHECK-NEXT: or.b64 %rd34, %rd32, %rd33;
+; CHECK-NEXT: shr.u64 %rd32, %rd1, %r7;
+; CHECK-NEXT: or.b64 %rd33, %rd31, %rd32;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd35, %rd1, %r8;
-; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd71, %rd35, %rd34, %p17;
+; CHECK-NEXT: shl.b64 %rd34, %rd1, %r8;
+; CHECK-NEXT: setp.gt.s32 %p16, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd71, %rd34, %rd33, %p16;
; CHECK-NEXT: shl.b64 %rd70, %rd1, %r6;
+; CHECK-NEXT: or.b64 %rd35, %rd66, %rd67;
+; CHECK-NEXT: setp.eq.b64 %p17, %rd35, 0;
; CHECK-NEXT: mov.b64 %rd64, %rd65;
-; CHECK-NEXT: @%p16 bra $L__BB4_4;
+; CHECK-NEXT: @%p17 bra $L__BB4_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
; CHECK-NEXT: cvt.u32.u64 %r9, %rd66;
; CHECK-NEXT: shr.u64 %rd36, %rd1, %r9;
@@ -487,21 +487,21 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: // %bb.3: // %udiv-bb1
; CHECK-NEXT: add.cc.s64 %rd52, %rd17, 1;
; CHECK-NEXT: addc.cc.s64 %rd53, %rd18, 0;
-; CHECK-NEXT: or.b64 %rd21, %rd52, %rd53;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd21, 0;
; CHECK-NEXT: cvt.u32.u64 %r5, %rd17;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd22, %rd4, %r6;
+; CHECK-NEXT: shl.b64 %rd21, %rd4, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd23, %rd3, %r7;
-; CHECK-NEXT: or.b64 %rd24, %rd22, %rd23;
+; CHECK-NEXT: shr.u64 %rd22, %rd3, %r7;
+; CHECK-NEXT: or.b64 %rd23, %rd21, %rd22;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd25, %rd3, %r8;
-; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd57, %rd25, %rd24, %p15;
+; CHECK-NEXT: shl.b64 %rd24, %rd3, %r8;
+; CHECK-NEXT: setp.gt.s32 %p14, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd57, %rd24, %rd23, %p14;
; CHECK-NEXT: shl.b64 %rd56, %rd3, %r6;
+; CHECK-NEXT: or.b64 %rd25, %rd52, %rd53;
+; CHECK-NEXT: setp.eq.b64 %p15, %rd25, 0;
; CHECK-NEXT: mov.b64 %rd50, %rd51;
-; CHECK-NEXT: @%p14 bra $L__BB5_4;
+; CHECK-NEXT: @%p15 bra $L__BB5_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
; CHECK-NEXT: cvt.u32.u64 %r9, %rd52;
; CHECK-NEXT: shr.u64 %rd26, %rd3, %r9;
diff --git a/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll
index 4dc6d0ad3d5c7b..05fe11026cc597 100644
--- a/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll
@@ -370,42 +370,42 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
; CHECK-LABEL: atomicrmw_udec_wrap_i64:
; CHECK: # %bb.0:
; CHECK-NEXT: sync
-; CHECK-NEXT: ld 6, 0(3)
+; CHECK-NEXT: ld 8, 0(3)
+; CHECK-NEXT: li 6, 1
+; CHECK-NEXT: li 7, 0
; CHECK-NEXT: .LBB7_1: # %atomicrmw.start
; CHECK-NEXT: # =>This Loop Header: Depth=1
-; CHECK-NEXT: # Child Loop BB7_4 Depth 2
-; CHECK-NEXT: cmpdi 6, 0
-; CHECK-NEXT: mr 7, 4
-; CHECK-NEXT: bc 12, 2, .LBB7_4
+; CHECK-NEXT: # Child Loop BB7_3 Depth 2
+; CHECK-NEXT: subc 5, 8, 6
+; CHECK-NEXT: addze. 9, 7
+; CHECK-NEXT: cmpld 1, 8, 4
+; CHECK-NEXT: cror 20, 2, 5
+; CHECK-NEXT: mr 9, 4
+; CHECK-NEXT: bc 12, 20, .LBB7_3
; CHECK-NEXT: # %bb.2: # %atomicrmw.start
; CHECK-NEXT: #
-; CHECK-NEXT: cmpld 6, 4
-; CHECK-NEXT: mr 7, 4
-; CHECK-NEXT: bc 12, 1, .LBB7_4
-; CHECK-NEXT: # %bb.3: # %atomicrmw.start
-; CHECK-NEXT: #
-; CHECK-NEXT: addi 7, 6, -1
-; CHECK-NEXT: .LBB7_4: # %cmpxchg.start
+; CHECK-NEXT: mr 9, 5
+; CHECK-NEXT: .LBB7_3: # %cmpxchg.start
; CHECK-NEXT: # Parent Loop BB7_1 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldarx 5, 0, 3
-; CHECK-NEXT: cmpld 5, 6
-; CHECK-NEXT: bne- 0, .LBB7_7
-; CHECK-NEXT: # %bb.5: # %cmpxchg.fencedstore
+; CHECK-NEXT: cmpld 5, 8
+; CHECK-NEXT: bne- 0, .LBB7_6
+; CHECK-NEXT: # %bb.4: # %cmpxchg.fencedstore
; CHECK-NEXT: #
-; CHECK-NEXT: stdcx. 7, 0, 3
+; CHECK-NEXT: stdcx. 9, 0, 3
; CHECK-NEXT: creqv 20, 20, 20
-; CHECK-NEXT: bne- 0, .LBB7_4
-; CHECK-NEXT: # %bb.6: # %cmpxchg.end
+; CHECK-NEXT: bne- 0, .LBB7_3
+; CHECK-NEXT: # %bb.5: # %cmpxchg.end
; CHECK-NEXT: #
-; CHECK-NEXT: mr 6, 5
+; CHECK-NEXT: mr 8, 5
; CHECK-NEXT: bc 4, 20, .LBB7_1
-; CHECK-NEXT: b .LBB7_8
-; CHECK-NEXT: .LBB7_7: # %cmpxchg.nostore
+; CHECK-NEXT: b .LBB7_7
+; CHECK-NEXT: .LBB7_6: # %cmpxchg.nostore
; CHECK-NEXT: #
-; CHECK-NEXT: mr 6, 5
+; CHECK-NEXT: mr 8, 5
; CHECK-NEXT: b .LBB7_1
-; CHECK-NEXT: .LBB7_8: # %atomicrmw.end
+; CHECK-NEXT: .LBB7_7: # %atomicrmw.end
; CHECK-NEXT: mr 3, 5
; CHECK-NEXT: lwsync
; CHECK-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll
index 34b703a9811058..fc608f9f6410b6 100644
--- a/llvm/test/CodeGen/PowerPC/sat-add.ll
+++ b/llvm/test/CodeGen/PowerPC/sat-add.ll
@@ -24,11 +24,12 @@ define i8 @unsigned_sat_constant_i8_using_min(i8 %x) {
define i8 @unsigned_sat_constant_i8_using_cmp_sum(i8 %x) {
; CHECK-LABEL: unsigned_sat_constant_i8_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 3, 3, 24
+; CHECK-NEXT: clrlwi 4, 3, 24
; CHECK-NEXT: addi 3, 3, 42
-; CHECK-NEXT: andi. 4, 3, 256
+; CHECK-NEXT: clrlwi 5, 3, 24
+; CHECK-NEXT: cmplw 4, 5
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iseleq 3, 3, 4
+; CHECK-NEXT: iselgt 3, 4, 3
; CHECK-NEXT: blr
%a = add i8 %x, 42
%c = icmp ugt i8 %x, %a
@@ -69,11 +70,12 @@ define i16 @unsigned_sat_constant_i16_using_min(i16 %x) {
define i16 @unsigned_sat_constant_i16_using_cmp_sum(i16 %x) {
; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 3, 3, 16
+; CHECK-NEXT: clrlwi 4, 3, 16
; CHECK-NEXT: addi 3, 3, 42
-; CHECK-NEXT: andis. 4, 3, 1
+; CHECK-NEXT: clrlwi 5, 3, 16
+; CHECK-NEXT: cmplw 4, 5
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iseleq 3, 3, 4
+; CHECK-NEXT: iselgt 3, 4, 3
; CHECK-NEXT: blr
%a = add i16 %x, 42
%c = icmp ugt i16 %x, %a
@@ -114,9 +116,9 @@ define i32 @unsigned_sat_constant_i32_using_cmp_sum(i32 %x) {
; CHECK-LABEL: unsigned_sat_constant_i32_using_cmp_sum:
; CHECK: # %bb.0:
; CHECK-NEXT: addi 4, 3, 42
-; CHECK-NEXT: cmplw 4, 3
+; CHECK-NEXT: cmplw 3, 4
; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: isellt 3, 3, 4
+; CHECK-NEXT: iselgt 3, 3, 4
; CHECK-NEXT: blr
%a = add i32 %x, 42
%c = icmp ugt i32 %x, %a
@@ -203,12 +205,12 @@ define i8 @unsigned_sat_variable_i8_using_min(i8 %x, i8 %y) {
define i8 @unsigned_sat_variable_i8_using_cmp_sum(i8 %x, i8 %y) {
; CHECK-LABEL: unsigned_sat_variable_i8_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 4, 4, 24
-; CHECK-NEXT: clrlwi 3, 3, 24
+; CHECK-NEXT: clrlwi 5, 3, 24
; CHECK-NEXT: add 3, 3, 4
-; CHECK-NEXT: andi. 4, 3, 256
+; CHECK-NEXT: clrlwi 4, 3, 24
+; CHECK-NEXT: cmplw 5, 4
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iseleq 3, 3, 4
+; CHECK-NEXT: iselgt 3, 4, 3
; CHECK-NEXT: blr
%a = add i8 %x, %y
%c = icmp ugt i8 %x, %a
@@ -254,12 +256,12 @@ define i16 @unsigned_sat_variable_i16_using_min(i16 %x, i16 %y) {
define i16 @unsigned_sat_variable_i16_using_cmp_sum(i16 %x, i16 %y) {
; CHECK-LABEL: unsigned_sat_variable_i16_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 4, 4, 16
-; CHECK-NEXT: clrlwi 3, 3, 16
+; CHECK-NEXT: clrlwi 5, 3, 16
; CHECK-NEXT: add 3, 3, 4
-; CHECK-NEXT: andis. 4, 3, 1
+; CHECK-NEXT: clrlwi 4, 3, 16
+; CHECK-NEXT: cmplw 5, 4
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iseleq 3, 3, 4
+; CHECK-NEXT: iselgt 3, 4, 3
; CHECK-NEXT: blr
%a = add i16 %x, %y
%c = icmp ugt i16 %x, %a
@@ -304,9 +306,9 @@ define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) {
; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_sum:
; CHECK: # %bb.0:
; CHECK-NEXT: add 4, 3, 4
-; CHECK-NEXT: cmplw 4, 3
+; CHECK-NEXT: cmplw 3, 4
; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: isellt 3, 3, 4
+; CHECK-NEXT: iselgt 3, 3, 4
; CHECK-NEXT: blr
%a = add i32 %x, %y
%c = icmp ugt i32 %x, %a
diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
index 02aeebdeb37757..b1d396d70ff5fd 100644
--- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll
+++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
@@ -8,19 +8,21 @@ define i32 @optbranch_32(i32 %Arg) {
; RV32-LABEL: optbranch_32:
; RV32: # %bb.0: # %bb
; RV32-NEXT: addi a0, a0, 1
-; RV32-NEXT: bnez a0, .LBB0_2
-; RV32-NEXT: # %bb.1: # %bb2
+; RV32-NEXT: beqz a0, .LBB0_2
+; RV32-NEXT: # %bb.1: # %bb3
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB0_2: # %bb2
; RV32-NEXT: li a0, -1
-; RV32-NEXT: .LBB0_2: # %bb3
; RV32-NEXT: ret
;
; RV64-LABEL: optbranch_32:
; RV64: # %bb.0: # %bb
; RV64-NEXT: addiw a0, a0, 1
-; RV64-NEXT: bnez a0, .LBB0_2
-; RV64-NEXT: # %bb.1: # %bb2
+; RV64-NEXT: beqz a0, .LBB0_2
+; RV64-NEXT: # %bb.1: # %bb3
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB0_2: # %bb2
; RV64-NEXT: li a0, -1
-; RV64-NEXT: .LBB0_2: # %bb3
; RV64-NEXT: ret
bb:
%i1 = icmp eq i32 %Arg, -1
@@ -41,20 +43,22 @@ define i64 @optbranch_64(i64 %Arg) {
; RV32-NEXT: seqz a2, a0
; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: or a2, a0, a1
-; RV32-NEXT: bnez a2, .LBB1_2
-; RV32-NEXT: # %bb.1: # %bb2
+; RV32-NEXT: beqz a2, .LBB1_2
+; RV32-NEXT: # %bb.1: # %bb3
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB1_2: # %bb2
; RV32-NEXT: li a0, -1
; RV32-NEXT: li a1, -1
-; RV32-NEXT: .LBB1_2: # %bb3
; RV32-NEXT: ret
;
; RV64-LABEL: optbranch_64:
; RV64: # %bb.0: # %bb
; RV64-NEXT: addi a0, a0, 1
-; RV64-NEXT: bnez a0, .LBB1_2
-; RV64-NEXT: # %bb.1: # %bb2
+; RV64-NEXT: beqz a0, .LBB1_2
+; RV64-NEXT: # %bb.1: # %bb3
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB1_2: # %bb2
; RV64-NEXT: li a0, -1
-; RV64-NEXT: .LBB1_2: # %bb3
; RV64-NEXT: ret
bb:
%i1 = icmp eq i64 %Arg, -1
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index ba6769b2aa3e1f..148886224454c7 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -48,33 +48,36 @@ define i64 @uaddo1_overflow_used(i64 %a, i64 %b) nounwind ssp {
define i64 @uaddo1_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
; RV32-LABEL: uaddo1_math_overflow_used:
; RV32: # %bb.0:
-; RV32-NEXT: add a5, a3, a1
-; RV32-NEXT: add a0, a2, a0
-; RV32-NEXT: sltu a1, a0, a2
-; RV32-NEXT: add a5, a5, a1
-; RV32-NEXT: beq a5, a3, .LBB1_2
+; RV32-NEXT: add a6, a3, a1
+; RV32-NEXT: add a5, a2, a0
+; RV32-NEXT: sltu a7, a5, a2
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: beq a6, a1, .LBB1_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: sltu a1, a5, a3
+; RV32-NEXT: sltu a0, a6, a1
+; RV32-NEXT: beqz a0, .LBB1_3
+; RV32-NEXT: j .LBB1_4
; RV32-NEXT: .LBB1_2:
-; RV32-NEXT: bnez a1, .LBB1_4
-; RV32-NEXT: # %bb.3:
+; RV32-NEXT: sltu a0, a5, a0
+; RV32-NEXT: bnez a0, .LBB1_4
+; RV32-NEXT: .LBB1_3:
; RV32-NEXT: li a2, 42
; RV32-NEXT: .LBB1_4:
-; RV32-NEXT: neg a1, a1
+; RV32-NEXT: neg a1, a0
; RV32-NEXT: and a1, a1, a3
-; RV32-NEXT: sw a0, 0(a4)
-; RV32-NEXT: sw a5, 4(a4)
+; RV32-NEXT: sw a5, 0(a4)
+; RV32-NEXT: sw a6, 4(a4)
; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: uaddo1_math_overflow_used:
; RV64: # %bb.0:
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: bltu a0, a1, .LBB1_2
+; RV64-NEXT: add a3, a1, a0
+; RV64-NEXT: bltu a3, a0, .LBB1_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a1, 42
; RV64-NEXT: .LBB1_2:
-; RV64-NEXT: sd a0, 0(a2)
+; RV64-NEXT: sd a3, 0(a2)
; RV64-NEXT: mv a0, a1
; RV64-NEXT: ret
%add = add i64 %b, %a
@@ -200,7 +203,7 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
; RV32-NEXT: add a0, a2, a0
; RV32-NEXT: sltu a1, a0, a2
; RV32-NEXT: add a5, a5, a1
-; RV32-NEXT: beq a5, a3, .LBB5_2
+; RV32-NEXT: beq a3, a5, .LBB5_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: sltu a1, a5, a3
; RV32-NEXT: .LBB5_2:
@@ -617,9 +620,10 @@ define i1 @uaddo_i64_increment_alt(i64 %x, ptr %p) {
; RV32-LABEL: uaddo_i64_increment_alt:
; RV32: # %bb.0:
; RV32-NEXT: addi a3, a0, 1
-; RV32-NEXT: seqz a0, a3
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: or a0, a3, a1
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: seqz a4, a3
+; RV32-NEXT: addi a0, a0, 1
+; RV32-NEXT: add a1, a1, a4
; RV32-NEXT: seqz a0, a0
; RV32-NEXT: sw a3, 0(a2)
; RV32-NEXT: sw a1, 4(a2)
@@ -642,12 +646,13 @@ define i1 @uaddo_i64_increment_alt(i64 %x, ptr %p) {
define i1 @uaddo_i64_increment_alt_dom(i64 %x, ptr %p) {
; RV32-LABEL: uaddo_i64_increment_alt_dom:
; RV32: # %bb.0:
-; RV32-NEXT: addi a3, a0, 1
+; RV32-NEXT: and a3, a0, a1
+; RV32-NEXT: addi a4, a0, 1
+; RV32-NEXT: addi a3, a3, 1
+; RV32-NEXT: seqz a5, a4
; RV32-NEXT: seqz a0, a3
-; RV32-NEXT: add a1, a1, a0
-; RV32-NEXT: or a0, a3, a1
-; RV32-NEXT: seqz a0, a0
-; RV32-NEXT: sw a3, 0(a2)
+; RV32-NEXT: add a1, a1, a5
+; RV32-NEXT: sw a4, 0(a2)
; RV32-NEXT: sw a1, 4(a2)
; RV32-NEXT: ret
;
@@ -668,18 +673,16 @@ define i1 @uaddo_i64_increment_alt_dom(i64 %x, ptr %p) {
define i1 @uaddo_i32_decrement_alt(i32 signext %x, ptr %p) {
; RV32-LABEL: uaddo_i32_decrement_alt:
; RV32: # %bb.0:
-; RV32-NEXT: snez a2, a0
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: sw a0, 0(a1)
-; RV32-NEXT: mv a0, a2
+; RV32-NEXT: addi a2, a0, -1
+; RV32-NEXT: snez a0, a0
+; RV32-NEXT: sw a2, 0(a1)
; RV32-NEXT: ret
;
; RV64-LABEL: uaddo_i32_decrement_alt:
; RV64: # %bb.0:
-; RV64-NEXT: snez a2, a0
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: sw a0, 0(a1)
-; RV64-NEXT: mv a0, a2
+; RV64-NEXT: addi a2, a0, -1
+; RV64-NEXT: snez a0, a0
+; RV64-NEXT: sw a2, 0(a1)
; RV64-NEXT: ret
%a = add i32 %x, -1
store i32 %a, ptr %p
@@ -690,21 +693,20 @@ define i1 @uaddo_i32_decrement_alt(i32 signext %x, ptr %p) {
define i1 @uaddo_i64_decrement_alt(i64 %x, ptr %p) {
; RV32-LABEL: uaddo_i64_decrement_alt:
; RV32: # %bb.0:
-; RV32-NEXT: or a3, a0, a1
-; RV32-NEXT: seqz a4, a0
-; RV32-NEXT: addi a5, a0, -1
-; RV32-NEXT: snez a0, a3
-; RV32-NEXT: sub a1, a1, a4
-; RV32-NEXT: sw a5, 0(a2)
+; RV32-NEXT: seqz a3, a0
+; RV32-NEXT: addi a4, a0, -1
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: sub a1, a1, a3
+; RV32-NEXT: snez a0, a0
+; RV32-NEXT: sw a4, 0(a2)
; RV32-NEXT: sw a1, 4(a2)
; RV32-NEXT: ret
;
; RV64-LABEL: uaddo_i64_decrement_alt:
; RV64: # %bb.0:
-; RV64-NEXT: snez a2, a0
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: sd a0, 0(a1)
-; RV64-NEXT: mv a0, a2
+; RV64-NEXT: addi a2, a0, -1
+; RV64-NEXT: snez a0, a0
+; RV64-NEXT: sd a2, 0(a1)
; RV64-NEXT: ret
%a = add i64 %x, -1
store i64 %a, ptr %p
diff --git a/llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll b/llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll
index 2db620dab88017..0b389e3a26c782 100644
--- a/llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll
+++ b/llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; This test aims to check ability to support "Arithmetic with Overflow" intrinsics
; in the special case when those intrinsics are being generated by the CodeGenPrepare;
; pass during translations with optimization (note -disable-lsr, to inhibit
@@ -89,3 +90,6 @@ l1:
exit:
ret i32 %i
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
+; NOLSR: {{.*}}
diff --git a/llvm/test/CodeGen/Thumb/scheduler-clone-cpsr-def.ll b/llvm/test/CodeGen/Thumb/scheduler-clone-cpsr-def.ll
index 31e54c43c1e5fd..4c92a000204751 100644
--- a/llvm/test/CodeGen/Thumb/scheduler-clone-cpsr-def.ll
+++ b/llvm/test/CodeGen/Thumb/scheduler-clone-cpsr-def.ll
@@ -11,27 +11,20 @@
define i64 @f(i64 %x2, i32 %z) {
; CHECK-LABEL: f:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: subs r3, r0, #1
-; CHECK-NEXT: mov r3, r1
-; CHECK-NEXT: sbcs r3, r2
-; CHECK-NEXT: mov r3, r2
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: orrs r2, r1
+; CHECK-NEXT: rsbs r3, r2, #0
; CHECK-NEXT: adcs r3, r2
-; CHECK-NEXT: movs r4, #30
-; CHECK-NEXT: subs r5, r0, #1
-; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: sbcs r5, r2
-; CHECK-NEXT: adcs r4, r2
-; CHECK-NEXT: lsls r2, r1, #1
-; CHECK-NEXT: lsls r2, r4
-; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: eors r4, r3
-; CHECK-NEXT: lsrs r0, r4
-; CHECK-NEXT: orrs r0, r2
-; CHECK-NEXT: lsrs r1, r4
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: lsrs r0, r3
+; CHECK-NEXT: movs r2, #31
+; CHECK-NEXT: eors r2, r3
+; CHECK-NEXT: lsls r4, r1, #1
+; CHECK-NEXT: lsls r4, r2
+; CHECK-NEXT: orrs r0, r4
+; CHECK-NEXT: lsrs r1, r3
+; CHECK-NEXT: pop {r4, pc}
%x3 = add nsw i64 %x2, -1
%x8 = icmp ne i64 %x2, 0
%x9 = xor i1 %x8, true
diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll
index b7c34070f1af6a..7309f1902bff63 100644
--- a/llvm/test/CodeGen/X86/abdu-neg.ll
+++ b/llvm/test/CodeGen/X86/abdu-neg.ll
@@ -804,21 +804,24 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind {
define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind {
; X86-LABEL: abd_cmp_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $0, %esi
-; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: xorl %esi, %ecx
-; X86-NEXT: xorl %esi, %eax
-; X86-NEXT: subl %esi, %eax
-; X86-NEXT: sbbl %esi, %ecx
-; X86-NEXT: negl %eax
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: sbbl %esi, %edx
+; X86-NEXT: cmovael %edi, %eax
+; X86-NEXT: cmovael %ebx, %edx
; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: abd_cmp_i64:
@@ -845,36 +848,34 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 32(%ebp), %esi
-; X86-NEXT: movl 36(%ebp), %eax
; X86-NEXT: movl 24(%ebp), %ecx
-; X86-NEXT: movl 28(%ebp), %edi
-; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: movl 44(%ebp), %esi
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 48(%ebp), %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl 52(%ebp), %eax
+; X86-NEXT: sbbl %ebx, %eax
; X86-NEXT: subl 40(%ebp), %ecx
-; X86-NEXT: sbbl 44(%ebp), %edi
+; X86-NEXT: sbbl 44(%ebp), %edx
; X86-NEXT: sbbl 48(%ebp), %esi
-; X86-NEXT: sbbl 52(%ebp), %eax
-; X86-NEXT: movl $0, %ebx
-; X86-NEXT: sbbl %ebx, %ebx
-; X86-NEXT: xorl %ebx, %eax
-; X86-NEXT: xorl %ebx, %esi
-; X86-NEXT: xorl %ebx, %edi
-; X86-NEXT: xorl %ebx, %ecx
-; X86-NEXT: subl %ebx, %ecx
-; X86-NEXT: sbbl %ebx, %edi
-; X86-NEXT: sbbl %ebx, %esi
-; X86-NEXT: sbbl %ebx, %eax
-; X86-NEXT: negl %ecx
-; X86-NEXT: movl $0, %ebx
-; X86-NEXT: sbbl %edi, %ebx
-; X86-NEXT: movl $0, %edi
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: sbbl 52(%ebp), %ebx
+; X86-NEXT: cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovael %edi, %esi
+; X86-NEXT: cmovael %eax, %ebx
; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %ebx, 4(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %edx, 12(%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -884,19 +885,15 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
;
; X64-LABEL: abd_cmp_i128:
; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: xorl %edi, %edi
-; X64-NEXT: subq %rdx, %rax
-; X64-NEXT: sbbq %rcx, %rsi
-; X64-NEXT: movl $0, %ecx
-; X64-NEXT: sbbq %rcx, %rcx
-; X64-NEXT: xorq %rcx, %rsi
-; X64-NEXT: xorq %rcx, %rax
-; X64-NEXT: subq %rcx, %rax
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: subq %rdi, %rax
+; X64-NEXT: movq %rcx, %r8
+; X64-NEXT: sbbq %rsi, %r8
+; X64-NEXT: subq %rdx, %rdi
; X64-NEXT: sbbq %rcx, %rsi
-; X64-NEXT: negq %rax
-; X64-NEXT: sbbq %rsi, %rdi
-; X64-NEXT: movq %rdi, %rdx
+; X64-NEXT: cmovbq %rdi, %rax
+; X64-NEXT: cmovbq %rsi, %r8
+; X64-NEXT: movq %r8, %rdx
; X64-NEXT: retq
%cmp = icmp ult i128 %a, %b
%ab = sub i128 %a, %b
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 455b72d16a0755..7236b2c3eec5d5 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -152,14 +152,14 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $176, %esp
-; X86-NEXT: movl 32(%ebp), %edx
-; X86-NEXT: movl 36(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: xorl %eax, %ecx
-; X86-NEXT: movl %ecx, %edi
; X86-NEXT: xorl %eax, %edx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, %esi
; X86-NEXT: movl 28(%ebp), %edx
; X86-NEXT: xorl %eax, %edx
; X86-NEXT: movl 24(%ebp), %ecx
@@ -172,26 +172,27 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%ebp), %esi
-; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl 52(%ebp), %ecx
+; X86-NEXT: movl %ecx, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: xorl %edx, %edi
; X86-NEXT: movl 48(%ebp), %ecx
; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: movl 44(%ebp), %ebx
; X86-NEXT: xorl %edx, %ebx
-; X86-NEXT: movl 40(%ebp), %edi
-; X86-NEXT: xorl %edx, %edi
-; X86-NEXT: subl %edx, %edi
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: subl %edx, %esi
; X86-NEXT: sbbl %edx, %ebx
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: sbbl %edx, %edi
; X86-NEXT: xorl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
@@ -203,92 +204,99 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: bsrl %esi, %edx
-; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: bsrl %edi, %ecx
; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %esi, %esi
-; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: bsrl %edi, %edi
-; X86-NEXT: xorl $31, %edi
-; X86-NEXT: orl $32, %edi
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: cmovnel %edx, %edi
-; X86-NEXT: orl $64, %edi
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: bsrl %ebx, %eax
+; X86-NEXT: xorl $31, %eax
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: cmovnel %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %edx
-; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: bsrl %esi, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: cmovnel %eax, %ecx
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %esi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %esi
; X86-NEXT: xorl $31, %esi
-; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: orl $32, %edx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %eax, %eax
; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: orl $64, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: cmovnel %edi, %esi
+; X86-NEXT: orl $64, %esi
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmovnel %edx, %esi
; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: subl %edx, %edi
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: sbbl %edx, %edx
-; X86-NEXT: movl $0, %esi
-; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: subl %esi, %ecx
; X86-NEXT: movl $0, %eax
; X86-NEXT: sbbl %eax, %eax
-; X86-NEXT: movl $127, %ecx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edi, %ecx
-; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %edx, %ecx
-; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: movl $127, %edx
+; X86-NEXT: cmpl %ecx, %edx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl $0, %edx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %ecx
-; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ecx
-; X86-NEXT: setb %cl
-; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT: sbbl %esi, %edx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edi, %edx
+; X86-NEXT: setb %dl
+; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: cmovnel %ebx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: cmovnel %ebx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: cmovnel %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: cmovnel %ebx, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: cmovnel %ebx, %edi
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: jne .LBB4_1
; X86-NEXT: # %bb.8: # %_udiv-special-cases
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl $127, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: je .LBB4_9
; X86-NEXT: # %bb.5: # %udiv-bb1
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: xorps %xmm0, %xmm0
@@ -299,8 +307,6 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: xorb $127, %cl
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
@@ -310,251 +316,245 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl 152(%esp,%eax), %esi
; X86-NEXT: movl 156(%esp,%eax), %edx
; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 144(%esp,%eax), %edx
-; X86-NEXT: movl 148(%esp,%eax), %eax
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl $1, %ebx
+; X86-NEXT: movl 144(%esp,%eax), %ebx
+; X86-NEXT: movl 148(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: jae .LBB4_2
-; X86-NEXT: # %bb.6:
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: jmp .LBB4_7
-; X86-NEXT: .LBB4_1:
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: jmp .LBB4_9
-; X86-NEXT: .LBB4_2: # %udiv-preheader
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: je .LBB4_6
+; X86-NEXT: # %bb.2: # %udiv-preheader
; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 108(%esp,%eax), %edx
+; X86-NEXT: movl 108(%esp,%eax), %ebx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 104(%esp,%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 96(%esp,%eax), %edi
+; X86-NEXT: movl 100(%esp,%eax), %eax
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 104(%esp,%eax), %ebx
-; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: shrdl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%esp,%eax), %esi
-; X86-NEXT: movl 100(%esp,%eax), %eax
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: shrdl %cl, %ebx, %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: shrl %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shrdl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: orl %ecx, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: shldl $1, %ebx, %eax
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %edx
-; X86-NEXT: shldl $1, %ecx, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %edi, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl $1, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %edx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: adcl $-1, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl $-1, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %ecx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: jne .LBB4_3
; X86-NEXT: # %bb.4:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ecx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jmp .LBB4_7
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: jmp .LBB4_9
+; X86-NEXT: .LBB4_6:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %ebx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: shldl $1, %eax, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: shldl $1, %edi, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: addl %edi, %edx
+; X86-NEXT: shldl $1, %esi, %edx
; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: shldl $1, %edi, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: addl %ebx, %ebx
+; X86-NEXT: orl %eax, %ebx
; X86-NEXT: .LBB4_9: # %udiv-end
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: xorl %ecx, %ebx
-; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: subl %ecx, %edx
-; X86-NEXT: sbbl %ecx, %eax
-; X86-NEXT: sbbl %ecx, %ebx
-; X86-NEXT: sbbl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%ebp), %ecx
-; X86-NEXT: movl %edx, (%ecx)
-; X86-NEXT: movl %eax, 4(%ecx)
-; X86-NEXT: movl %ebx, 8(%ecx)
-; X86-NEXT: movl %esi, 12(%ecx)
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%ebp), %ecx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: subl %eax, %ebx
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: sbbl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl 56(%ebp), %eax
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl 40(%ebp), %edi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull 40(%ebp)
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl 44(%ebp), %esi
-; X86-NEXT: mull %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: mull %edi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ebx, %edx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: setb %bl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %esi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: mull %ebx
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl 40(%ebp), %eax
-; X86-NEXT: imull %eax, %ebx
-; X86-NEXT: mull %edi
+; X86-NEXT: imull %eax, %ecx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: imull %esi, %edi
+; X86-NEXT: imull %ebx, %esi
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: imull %edi, %ebx
+; X86-NEXT: movl 52(%ebp), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: imull %ecx, %edi
+; X86-NEXT: mull %ecx
; X86-NEXT: addl %edx, %edi
; X86-NEXT: addl %ebx, %edi
-; X86-NEXT: movl 48(%ebp), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 52(%ebp), %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: imull %edx, %ebx
-; X86-NEXT: mull %edx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: addl %esi, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl 24(%ebp), %edx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 28(%ebp), %ecx
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl 32(%ebp), %edi
-; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl 36(%ebp), %esi
-; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: sbbl %edi, %ebx
; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 859e9244d29d2b..199cae7f563b3a 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -152,12 +152,11 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $160, %esp
-; X86-NEXT: movl 40(%ebp), %ebx
-; X86-NEXT: movl 52(%ebp), %esi
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 52(%ebp), %ebx
; X86-NEXT: movl 44(%ebp), %edi
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: orl 48(%ebp), %ecx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
@@ -169,161 +168,157 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: bsrl %esi, %edx
-; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl 48(%ebp), %ecx
+; X86-NEXT: bsrl %ebx, %ecx
; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %esi, %esi
-; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: bsrl 48(%ebp), %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %ebx, %eax
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: bsrl %edi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: bsrl 40(%ebp), %eax
; X86-NEXT: xorl $31, %eax
; X86-NEXT: orl $32, %eax
; X86-NEXT: testl %edi, %edi
-; X86-NEXT: cmovnel %edx, %eax
+; X86-NEXT: cmovnel %esi, %eax
; X86-NEXT: orl $64, %eax
-; X86-NEXT: movl 48(%ebp), %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: cmovnel %ecx, %eax
-; X86-NEXT: movl 36(%ebp), %ebx
-; X86-NEXT: bsrl %ebx, %edx
-; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl 32(%ebp), %ecx
-; X86-NEXT: bsrl %ecx, %ecx
-; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl 48(%ebp), %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: cmovnel %edx, %eax
+; X86-NEXT: movl 36(%ebp), %edi
; X86-NEXT: bsrl %edi, %esi
; X86-NEXT: xorl $31, %esi
-; X86-NEXT: bsrl 24(%ebp), %edx
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: bsrl %ecx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: orl $32, %edx
; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: orl $64, %edx
-; X86-NEXT: movl 32(%ebp), %esi
-; X86-NEXT: orl %ebx, %esi
-; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: subl %edx, %eax
+; X86-NEXT: movl 28(%ebp), %ebx
+; X86-NEXT: bsrl %ebx, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: bsrl 24(%ebp), %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: cmovnel %edi, %esi
+; X86-NEXT: orl $64, %esi
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: orl 36(%ebp), %edi
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: cmovnel %edx, %esi
+; X86-NEXT: subl %esi, %eax
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
; X86-NEXT: movl $0, %ebx
; X86-NEXT: sbbl %ebx, %ebx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: sbbl %ecx, %ecx
-; X86-NEXT: movl $0, %esi
-; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: movl $127, %edx
+; X86-NEXT: movl $127, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %eax, %edx
-; X86-NEXT: movl $0, %edx
+; X86-NEXT: cmpl %eax, %esi
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: movl $0, %esi
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ebx, %edx
-; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: movl $0, %esi
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ecx, %edx
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %edx
-; X86-NEXT: setb %dl
-; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT: movl 36(%ebp), %eax
-; X86-NEXT: cmovnel %edi, %eax
-; X86-NEXT: movl 32(%ebp), %esi
-; X86-NEXT: cmovnel %edi, %esi
-; X86-NEXT: movl 28(%ebp), %edx
-; X86-NEXT: cmovnel %edi, %edx
-; X86-NEXT: movl 24(%ebp), %ebx
-; X86-NEXT: cmovnel %edi, %ebx
-; X86-NEXT: movl 56(%ebp), %edi
-; X86-NEXT: jne .LBB4_8
-; X86-NEXT: # %bb.1: # %_udiv-special-cases
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: sbbl %ecx, %esi
+; X86-NEXT: setb %ah
+; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Folded Reload
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: cmovnel %ecx, %ebx
+; X86-NEXT: cmovnel %ecx, %edi
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: cmovnel %ecx, %esi
+; X86-NEXT: jne .LBB4_1
+; X86-NEXT: # %bb.8: # %_udiv-special-cases
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: xorl $127, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: je .LBB4_9
+; X86-NEXT: # %bb.5: # %udiv-bb1
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl 56(%ebp), %edi
-; X86-NEXT: movl 24(%ebp), %ecx
-; X86-NEXT: je .LBB4_8
-; X86-NEXT: # %bb.2: # %udiv-bb1
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: xorps %xmm0, %xmm0
; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 28(%ebp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 32(%ebp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 36(%ebp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: xorb $127, %cl
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 136(%esp,%eax), %esi
-; X86-NEXT: movl 140(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 136(%esp,%eax), %edi
+; X86-NEXT: movl 140(%esp,%eax), %ebx
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 128(%esp,%eax), %ebx
-; X86-NEXT: movl 132(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $0, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: jae .LBB4_3
-; X86-NEXT: # %bb.6:
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: jmp .LBB4_7
-; X86-NEXT: .LBB4_3: # %udiv-preheader
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: je .LBB4_6
+; X86-NEXT: # %bb.2: # %udiv-preheader
; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%ebp), %edi
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 28(%ebp), %edi
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 32(%ebp), %edi
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 36(%ebp), %edi
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 92(%esp,%eax), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 92(%esp,%eax), %esi
+; X86-NEXT: movl 88(%esp,%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: shrdl %cl, %esi, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 88(%esp,%eax), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shrdl %cl, %edx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 80(%esp,%eax), %edx
; X86-NEXT: movl 84(%esp,%eax), %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shrdl %cl, %edi, %ebx
-; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shrdl %cl, %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 40(%ebp), %eax
@@ -338,41 +333,46 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl 52(%ebp), %eax
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: .p2align 4
-; X86-NEXT: .LBB4_4: # %udiv-do-while
+; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: shldl $1, %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %ebx
-; X86-NEXT: shldl $1, %ecx, %edi
-; X86-NEXT: shldl $1, %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %esi
-; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: orl %ecx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: orl %ecx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl $1, %eax
@@ -386,94 +386,100 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: andl 40(%ebp), %ecx
; X86-NEXT: subl %ecx, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %edx, %ebx
-; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: adcl $-1, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ecx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: jne .LBB4_4
-; X86-NEXT: # %bb.5:
+; X86-NEXT: jne .LBB4_3
+; X86-NEXT: # %bb.4:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: jmp .LBB4_7
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jmp .LBB4_9
+; X86-NEXT: .LBB4_6:
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl 56(%ebp), %edi
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: shldl $1, %edx, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: shldl $1, %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: shldl $1, %esi, %edx
; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: addl %ebx, %ebx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: .LBB4_8: # %udiv-end
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %esi, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: .LBB4_9: # %udiv-end
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, (%edi)
-; X86-NEXT: movl %edx, 4(%edi)
-; X86-NEXT: movl %esi, 8(%edi)
-; X86-NEXT: movl %eax, 12(%edi)
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl 56(%ebp), %eax
+; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, %esi
; X86-NEXT: imull %edx, %esi
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movl %edi, %esi
; X86-NEXT: movl 52(%ebp), %edi
-; X86-NEXT: imull %ebx, %edi
+; X86-NEXT: imull %ecx, %edi
; X86-NEXT: addl %edx, %edi
-; X86-NEXT: movl 40(%ebp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull 40(%ebp), %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl 44(%ebp), %eax
-; X86-NEXT: imull %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: imull %ecx, %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl 44(%ebp), %ebx
+; X86-NEXT: imull %ebx, %esi
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %edi, %ecx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull 44(%ebp)
-; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl 32(%ebp), %ebx
; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull 44(%ebp)
@@ -481,19 +487,19 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 24(%ebp), %ebx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 32(%ebp), %edi
-; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl 36(%ebp), %ecx
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: sbbl %edx, %esi
; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index 4e31b48ec5cece..1b307b30d8c0d1 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -2065,10 +2065,11 @@ define i64 @PR51612(i64 %x, i64 %y) {
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %eax
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %edx
-; ATHLON-NEXT: incl %edx
-; ATHLON-NEXT: addl $1, %eax
-; ATHLON-NEXT: adcl $0, %ecx
-; ATHLON-NEXT: cmovbl %edx, %eax
+; ATHLON-NEXT: addl $1, %ecx
+; ATHLON-NEXT: adcl $0, %edx
+; ATHLON-NEXT: incl %eax
+; ATHLON-NEXT: orl %ecx, %edx
+; ATHLON-NEXT: cmovnel %ecx, %eax
; ATHLON-NEXT: andl 10, %eax
; ATHLON-NEXT: xorl %edx, %edx
; ATHLON-NEXT: retl
@@ -2077,7 +2078,8 @@ define i64 @PR51612(i64 %x, i64 %y) {
; MCU: # %bb.0:
; MCU-NEXT: addl $1, %eax
; MCU-NEXT: adcl $0, %edx
-; MCU-NEXT: jae .LBB45_2
+; MCU-NEXT: orl %eax, %edx
+; MCU-NEXT: jne .LBB45_2
; MCU-NEXT: # %bb.1:
; MCU-NEXT: movl {{[0-9]+}}(%esp), %eax
; MCU-NEXT: incl %eax
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll
index f72679f55e114c..f114d9a2fd192f 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll
@@ -15,6 +15,16 @@ define i64 @uaddo1_overflow_used(i64 %a, i64 %b) nounwind ssp {
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo1_overflow_used(
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]]), !dbg [[DBG14:![0-9]+]]
+; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG14]]
+; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG14]]
+; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META9:![0-9]+]], !DIExpression(), [[DBG14]])
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META11:![0-9]+]], !DIExpression(), [[META15:![0-9]+]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG16:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META13:![0-9]+]], !DIExpression(), [[DBG16]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG17:![0-9]+]]
;
%add = add i64 %b, %a
%cmp = icmp ult i64 %add, %a
@@ -28,8 +38,19 @@ define i64 @uaddo1_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
-; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]]
+; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]], align 8
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo1_math_overflow_used(
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]]), !dbg [[DBG23:![0-9]+]]
+; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG23]]
+; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG23]]
+; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META20:![0-9]+]], !DIExpression(), [[DBG23]])
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META21:![0-9]+]], !DIExpression(), [[META24:![0-9]+]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG25:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META22:![0-9]+]], !DIExpression(), [[DBG25]])
+; DEBUG-NEXT: store i64 [[MATH]], ptr [[RES:%.*]], align 8, !dbg [[DBG26:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG27:![0-9]+]]
;
%add = add i64 %b, %a
%cmp = icmp ult i64 %add, %a
@@ -45,6 +66,16 @@ define i64 @uaddo2_overflow_used(i64 %a, i64 %b) nounwind ssp {
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo2_overflow_used(
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]]), !dbg [[DBG33:![0-9]+]]
+; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG33]]
+; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG33]]
+; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META30:![0-9]+]], !DIExpression(), [[DBG33]])
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META31:![0-9]+]], !DIExpression(), [[META34:![0-9]+]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG35:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META32:![0-9]+]], !DIExpression(), [[DBG35]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG36:![0-9]+]]
;
%add = add i64 %b, %a
%cmp = icmp ult i64 %add, %b
@@ -58,8 +89,19 @@ define i64 @uaddo2_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
-; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]]
+; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]], align 8
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo2_math_overflow_used(
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]]), !dbg [[DBG42:![0-9]+]]
+; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG42]]
+; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG42]]
+; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META39:![0-9]+]], !DIExpression(), [[DBG42]])
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META40:![0-9]+]], !DIExpression(), [[META43:![0-9]+]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG44:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META41:![0-9]+]], !DIExpression(), [[DBG44]])
+; DEBUG-NEXT: store i64 [[MATH]], ptr [[RES:%.*]], align 8, !dbg [[DBG45:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG46:![0-9]+]]
;
%add = add i64 %b, %a
%cmp = icmp ult i64 %add, %b
@@ -75,6 +117,16 @@ define i64 @uaddo3_overflow_used(i64 %a, i64 %b) nounwind ssp {
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo3_overflow_used(
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]]), !dbg [[DBG52:![0-9]+]]
+; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG52]]
+; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG52]]
+; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META49:![0-9]+]], !DIExpression(), [[DBG52]])
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META50:![0-9]+]], !DIExpression(), [[META53:![0-9]+]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG54:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META51:![0-9]+]], !DIExpression(), [[DBG54]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG55:![0-9]+]]
;
%add = add i64 %b, %a
%cmp = icmp ugt i64 %b, %add
@@ -88,8 +140,19 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
-; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]]
+; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]], align 8
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo3_math_overflow_used(
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]]), !dbg [[DBG61:![0-9]+]]
+; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG61]]
+; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG61]]
+; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META58:![0-9]+]], !DIExpression(), [[DBG61]])
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META59:![0-9]+]], !DIExpression(), [[META62:![0-9]+]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG63:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META60:![0-9]+]], !DIExpression(), [[DBG63]])
+; DEBUG-NEXT: store i64 [[MATH]], ptr [[RES:%.*]], align 8, !dbg [[DBG64:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG65:![0-9]+]]
;
%add = add i64 %b, %a
%cmp = icmp ugt i64 %b, %add
@@ -106,6 +169,15 @@ define i64 @uaddo6_xor(i64 %a, i64 %b) {
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo6_xor(
+; DEBUG-NEXT: #dbg_value(i64 poison, [[META68:![0-9]+]], !DIExpression(), [[META71:![0-9]+]])
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A:%.*]], i64 [[B:%.*]]), !dbg [[DBG72:![0-9]+]]
+; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG72]]
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META69:![0-9]+]], !DIExpression(), [[DBG72]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG73:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META70:![0-9]+]], !DIExpression(), [[DBG73]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG74:![0-9]+]]
;
%x = xor i64 %a, -1
%cmp = icmp ult i64 %x, %b
@@ -119,6 +191,15 @@ define i64 @uaddo6_xor_commuted(i64 %a, i64 %b) {
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo6_xor_commuted(
+; DEBUG-NEXT: #dbg_value(i64 poison, [[META77:![0-9]+]], !DIExpression(), [[META80:![0-9]+]])
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A:%.*]], i64 [[B:%.*]]), !dbg [[DBG81:![0-9]+]]
+; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG81]]
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META78:![0-9]+]], !DIExpression(), [[DBG81]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG82:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META79:![0-9]+]], !DIExpression(), [[DBG82]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG83:![0-9]+]]
;
%x = xor i64 %a, -1
%cmp = icmp ugt i64 %b, %x
@@ -135,6 +216,16 @@ define i64 @uaddo6_xor_multi_use(i64 %a, i64 %b) {
; CHECK-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42
; CHECK-NEXT: call void @use(i64 [[X]])
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo6_xor_multi_use(
+; DEBUG-NEXT: [[X:%.*]] = xor i64 -1, [[A:%.*]], !dbg [[DBG89:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[X]], [[META86:![0-9]+]], !DIExpression(), [[DBG89]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ult i64 [[X]], [[B:%.*]], !dbg [[DBG90:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META87:![0-9]+]], !DIExpression(), [[DBG90]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG91:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META88:![0-9]+]], !DIExpression(), [[DBG91]])
+; DEBUG-NEXT: call void @use(i64 [[X]]), !dbg [[DBG92:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG93:![0-9]+]]
;
%x = xor i64 -1, %a
%cmp = icmp ult i64 %x, %b
@@ -145,9 +236,18 @@ define i64 @uaddo6_xor_multi_use(i64 %a, i64 %b) {
define i1 @usubo_ult_i64_overflow_used(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: @usubo_ult_i64_overflow_used(
-; CHECK-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
+; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
+; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
; CHECK-NEXT: ret i1 [[OV]]
+;
+; DEBUG-LABEL: @usubo_ult_i64_overflow_used(
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]]), !dbg [[DBG98:![0-9]+]]
+; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG98]]
+; DEBUG-NEXT: [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG98]]
+; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META96:![0-9]+]], !DIExpression(), [[DBG98]])
+; DEBUG-NEXT: #dbg_value(i1 [[OV1]], [[META97:![0-9]+]], !DIExpression(), [[META99:![0-9]+]])
+; DEBUG-NEXT: ret i1 [[OV1]], !dbg [[DBG100:![0-9]+]]
;
%s = sub i64 %x, %y
%ov = icmp ult i64 %x, %y
@@ -156,10 +256,20 @@ define i1 @usubo_ult_i64_overflow_used(i64 %x, i64 %y, ptr %p) {
define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: @usubo_ult_i64_math_overflow_used(
-; CHECK-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: store i64 [[S]], ptr [[P:%.*]]
-; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
+; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
+; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
+; CHECK-NEXT: store i64 [[MATH]], ptr [[P:%.*]], align 8
; CHECK-NEXT: ret i1 [[OV]]
+;
+; DEBUG-LABEL: @usubo_ult_i64_math_overflow_used(
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]]), !dbg [[DBG105:![0-9]+]]
+; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG105]]
+; DEBUG-NEXT: [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG105]]
+; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META103:![0-9]+]], !DIExpression(), [[DBG105]])
+; DEBUG-NEXT: store i64 [[MATH]], ptr [[P:%.*]], align 8, !dbg [[DBG106:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[OV1]], [[META104:![0-9]+]], !DIExpression(), [[META107:![0-9]+]])
+; DEBUG-NEXT: ret i1 [[OV1]], !dbg [[DBG108:![0-9]+]]
;
%s = sub i64 %x, %y
store i64 %s, ptr %p
diff --git a/llvm/test/Transforms/CodeGenPrepare/SPARC/overflow-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/SPARC/overflow-intrinsics.ll
index ec60238cbf927e..d0a3ca4daa02f8 100644
--- a/llvm/test/Transforms/CodeGenPrepare/SPARC/overflow-intrinsics.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/SPARC/overflow-intrinsics.ll
@@ -14,6 +14,15 @@ define i64 @uaddo1_overflow_used(i64 %a, i64 %b) nounwind ssp {
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[ADD]], [[A]]
; CHECK-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo1_overflow_used(
+; DEBUG-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]], !dbg [[DBG14:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[ADD]], [[META9:![0-9]+]], !DIExpression(), [[DBG14]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ult i64 [[ADD]], [[A]], !dbg [[DBG15:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META11:![0-9]+]], !DIExpression(), [[DBG15]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG16:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META13:![0-9]+]], !DIExpression(), [[DBG16]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG17:![0-9]+]]
;
%add = add i64 %b, %a
%cmp = icmp ult i64 %add, %a
@@ -23,12 +32,21 @@ define i64 @uaddo1_overflow_used(i64 %a, i64 %b) nounwind ssp {
define i64 @uaddo1_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
; CHECK-LABEL: @uaddo1_math_overflow_used(
-; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
-; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
-; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[ADD]], [[A]]
; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
-; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]]
+; CHECK-NEXT: store i64 [[ADD]], ptr [[RES:%.*]], align 8
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo1_math_overflow_used(
+; DEBUG-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]], !dbg [[DBG23:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[ADD]], [[META20:![0-9]+]], !DIExpression(), [[DBG23]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ult i64 [[ADD]], [[A]], !dbg [[DBG24:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META21:![0-9]+]], !DIExpression(), [[DBG24]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG25:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META22:![0-9]+]], !DIExpression(), [[DBG25]])
+; DEBUG-NEXT: store i64 [[ADD]], ptr [[RES:%.*]], align 8, !dbg [[DBG26:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG27:![0-9]+]]
;
%add = add i64 %b, %a
%cmp = icmp ult i64 %add, %a
@@ -43,6 +61,15 @@ define i64 @uaddo2_overflow_used(i64 %a, i64 %b) nounwind ssp {
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[ADD]], [[B]]
; CHECK-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo2_overflow_used(
+; DEBUG-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]], !dbg [[DBG33:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[ADD]], [[META30:![0-9]+]], !DIExpression(), [[DBG33]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ult i64 [[ADD]], [[B]], !dbg [[DBG34:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META31:![0-9]+]], !DIExpression(), [[DBG34]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG35:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META32:![0-9]+]], !DIExpression(), [[DBG35]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG36:![0-9]+]]
;
%add = add i64 %b, %a
%cmp = icmp ult i64 %add, %b
@@ -52,12 +79,21 @@ define i64 @uaddo2_overflow_used(i64 %a, i64 %b) nounwind ssp {
define i64 @uaddo2_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
; CHECK-LABEL: @uaddo2_math_overflow_used(
-; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
-; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
-; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[ADD]], [[B]]
; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
-; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]]
+; CHECK-NEXT: store i64 [[ADD]], ptr [[RES:%.*]], align 8
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo2_math_overflow_used(
+; DEBUG-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]], !dbg [[DBG42:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[ADD]], [[META39:![0-9]+]], !DIExpression(), [[DBG42]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ult i64 [[ADD]], [[B]], !dbg [[DBG43:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META40:![0-9]+]], !DIExpression(), [[DBG43]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG44:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META41:![0-9]+]], !DIExpression(), [[DBG44]])
+; DEBUG-NEXT: store i64 [[ADD]], ptr [[RES:%.*]], align 8, !dbg [[DBG45:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG46:![0-9]+]]
;
%add = add i64 %b, %a
%cmp = icmp ult i64 %add, %b
@@ -72,6 +108,15 @@ define i64 @uaddo3_overflow_used(i64 %a, i64 %b) nounwind ssp {
; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[B]], [[ADD]]
; CHECK-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo3_overflow_used(
+; DEBUG-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]], !dbg [[DBG52:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[ADD]], [[META49:![0-9]+]], !DIExpression(), [[DBG52]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ugt i64 [[B]], [[ADD]], !dbg [[DBG53:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META50:![0-9]+]], !DIExpression(), [[DBG53]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG54:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META51:![0-9]+]], !DIExpression(), [[DBG54]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG55:![0-9]+]]
;
%add = add i64 %b, %a
%cmp = icmp ugt i64 %b, %add
@@ -81,12 +126,21 @@ define i64 @uaddo3_overflow_used(i64 %a, i64 %b) nounwind ssp {
define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
; CHECK-LABEL: @uaddo3_math_overflow_used(
-; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
-; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
-; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT: [[OV:%.*]] = icmp ugt i64 [[B]], [[ADD]]
; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
-; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]]
+; CHECK-NEXT: store i64 [[ADD]], ptr [[RES:%.*]], align 8
; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo3_math_overflow_used(
+; DEBUG-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]], !dbg [[DBG61:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[ADD]], [[META58:![0-9]+]], !DIExpression(), [[DBG61]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ugt i64 [[B]], [[ADD]], !dbg [[DBG62:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META59:![0-9]+]], !DIExpression(), [[DBG62]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG63:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META60:![0-9]+]], !DIExpression(), [[DBG63]])
+; DEBUG-NEXT: store i64 [[ADD]], ptr [[RES:%.*]], align 8, !dbg [[DBG64:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG65:![0-9]+]]
;
%add = add i64 %b, %a
%cmp = icmp ugt i64 %b, %add
@@ -100,6 +154,13 @@ define i1 @usubo_ult_i64_overflow_used(i64 %x, i64 %y, ptr %p) {
; CHECK-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
; CHECK-NEXT: ret i1 [[OV]]
+;
+; DEBUG-LABEL: @usubo_ult_i64_overflow_used(
+; DEBUG-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]], !dbg [[DBG70:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[S]], [[META68:![0-9]+]], !DIExpression(), [[DBG70]])
+; DEBUG-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]], !dbg [[DBG71:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META69:![0-9]+]], !DIExpression(), [[DBG71]])
+; DEBUG-NEXT: ret i1 [[OV]], !dbg [[DBG72:![0-9]+]]
;
%s = sub i64 %x, %y
%ov = icmp ult i64 %x, %y
@@ -109,9 +170,17 @@ define i1 @usubo_ult_i64_overflow_used(i64 %x, i64 %y, ptr %p) {
define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: @usubo_ult_i64_math_overflow_used(
; CHECK-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: store i64 [[S]], ptr [[P:%.*]]
+; CHECK-NEXT: store i64 [[S]], ptr [[P:%.*]], align 8
; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
; CHECK-NEXT: ret i1 [[OV]]
+;
+; DEBUG-LABEL: @usubo_ult_i64_math_overflow_used(
+; DEBUG-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]], !dbg [[DBG77:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[S]], [[META75:![0-9]+]], !DIExpression(), [[DBG77]])
+; DEBUG-NEXT: store i64 [[S]], ptr [[P:%.*]], align 8, !dbg [[DBG78:![0-9]+]]
+; DEBUG-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]], !dbg [[DBG79:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META76:![0-9]+]], !DIExpression(), [[DBG79]])
+; DEBUG-NEXT: ret i1 [[OV]], !dbg [[DBG80:![0-9]+]]
;
%s = sub i64 %x, %y
store i64 %s, ptr %p
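
For reference, a minimal LLVM IR sketch of the two shapes the overflow-intrinsics checks above distinguish; the function names are illustrative, not part of the patch. The updated CHECK/DEBUG lines keep the plain add + compare form, whereas the removed lines expected the intrinsic form that CodeGenPrepare previously produced here:

; Compare-with-operand form retained by the updated checks:
define i1 @uaddo_sketch(i64 %a, i64 %b, ptr %res) {
  %add = add i64 %b, %a
  store i64 %add, ptr %res, align 8
  %ov = icmp ult i64 %add, %b   ; unsigned add overflowed iff the sum is below an operand
  ret i1 %ov
}

; Intrinsic form the previous checks expected:
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

define i1 @uaddo_intrinsic_sketch(i64 %a, i64 %b, ptr %res) {
  %t = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %b, i64 %a)
  %math = extractvalue { i64, i1 } %t, 0
  %ov = extractvalue { i64, i1 } %t, 1
  store i64 %math, ptr %res, align 8
  ret i1 %ov
}
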
>From 9cb799df391bb80131e865b813299f58585cdd4e Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Wed, 24 Sep 2025 12:32:43 -0400
Subject: [PATCH 03/17] f
---
llvm/test/CodeGen/PowerPC/sat-add.ll | 62 +-
llvm/test/CodeGen/PowerPC/sat-add.s | 1260 +++++++++++++++++++++
llvm/test/CodeGen/RISCV/branch-on-zero.ll | 28 +-
llvm/test/CodeGen/X86/psubus.ll | 814 +++++++++----
llvm/test/CodeGen/X86/select.ll | 12 +-
test_direct_uaddo.ll | 9 +
test_sat_pattern.ll | 6 +
test_sat_pattern.s | 0
test_scalar_sat.ll | 6 +
test_uaddo_conversion.ll | 9 +
test_uaddo_only.ll | 6 +
test_uaddo_only.s | 22 +
test_uaddsat.ll | 9 +
test_usubo.ll | 15 +
test_vector_uaddo.ll | 9 +
test_vector_uaddo.s | 21 +
trace_uaddsat.ll | 6 +
17 files changed, 1998 insertions(+), 296 deletions(-)
create mode 100644 llvm/test/CodeGen/PowerPC/sat-add.s
create mode 100644 test_direct_uaddo.ll
create mode 100644 test_sat_pattern.ll
create mode 100644 test_sat_pattern.s
create mode 100644 test_scalar_sat.ll
create mode 100644 test_uaddo_conversion.ll
create mode 100644 test_uaddo_only.ll
create mode 100644 test_uaddo_only.s
create mode 100644 test_uaddsat.ll
create mode 100644 test_usubo.ll
create mode 100644 test_vector_uaddo.ll
create mode 100644 test_vector_uaddo.s
create mode 100644 trace_uaddsat.ll
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll
index fc608f9f6410b6..012f03f0b884c4 100644
--- a/llvm/test/CodeGen/PowerPC/sat-add.ll
+++ b/llvm/test/CodeGen/PowerPC/sat-add.ll
@@ -24,12 +24,11 @@ define i8 @unsigned_sat_constant_i8_using_min(i8 %x) {
define i8 @unsigned_sat_constant_i8_using_cmp_sum(i8 %x) {
; CHECK-LABEL: unsigned_sat_constant_i8_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 4, 3, 24
+; CHECK-NEXT: clrlwi 3, 3, 24
; CHECK-NEXT: addi 3, 3, 42
-; CHECK-NEXT: clrlwi 5, 3, 24
-; CHECK-NEXT: cmplw 4, 5
+; CHECK-NEXT: andi. 4, 3, 256
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iselgt 3, 4, 3
+; CHECK-NEXT: iseleq 3, 3, 4
; CHECK-NEXT: blr
%a = add i8 %x, 42
%c = icmp ugt i8 %x, %a
@@ -70,12 +69,11 @@ define i16 @unsigned_sat_constant_i16_using_min(i16 %x) {
define i16 @unsigned_sat_constant_i16_using_cmp_sum(i16 %x) {
; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 4, 3, 16
+; CHECK-NEXT: clrlwi 3, 3, 16
; CHECK-NEXT: addi 3, 3, 42
-; CHECK-NEXT: clrlwi 5, 3, 16
-; CHECK-NEXT: cmplw 4, 5
+; CHECK-NEXT: andis. 4, 3, 1
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iselgt 3, 4, 3
+; CHECK-NEXT: iseleq 3, 3, 4
; CHECK-NEXT: blr
%a = add i16 %x, 42
%c = icmp ugt i16 %x, %a
@@ -116,9 +114,9 @@ define i32 @unsigned_sat_constant_i32_using_cmp_sum(i32 %x) {
; CHECK-LABEL: unsigned_sat_constant_i32_using_cmp_sum:
; CHECK: # %bb.0:
; CHECK-NEXT: addi 4, 3, 42
-; CHECK-NEXT: cmplw 3, 4
+; CHECK-NEXT: cmplw 4, 3
; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: iselgt 3, 3, 4
+; CHECK-NEXT: isellt 3, 3, 4
; CHECK-NEXT: blr
%a = add i32 %x, 42
%c = icmp ugt i32 %x, %a
@@ -205,12 +203,12 @@ define i8 @unsigned_sat_variable_i8_using_min(i8 %x, i8 %y) {
define i8 @unsigned_sat_variable_i8_using_cmp_sum(i8 %x, i8 %y) {
; CHECK-LABEL: unsigned_sat_variable_i8_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 5, 3, 24
+; CHECK-NEXT: clrlwi 4, 4, 24
+; CHECK-NEXT: clrlwi 3, 3, 24
; CHECK-NEXT: add 3, 3, 4
-; CHECK-NEXT: clrlwi 4, 3, 24
-; CHECK-NEXT: cmplw 5, 4
+; CHECK-NEXT: andi. 4, 3, 256
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iselgt 3, 4, 3
+; CHECK-NEXT: iseleq 3, 3, 4
; CHECK-NEXT: blr
%a = add i8 %x, %y
%c = icmp ugt i8 %x, %a
@@ -256,12 +254,12 @@ define i16 @unsigned_sat_variable_i16_using_min(i16 %x, i16 %y) {
define i16 @unsigned_sat_variable_i16_using_cmp_sum(i16 %x, i16 %y) {
; CHECK-LABEL: unsigned_sat_variable_i16_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 5, 3, 16
+; CHECK-NEXT: clrlwi 4, 4, 16
+; CHECK-NEXT: clrlwi 3, 3, 16
; CHECK-NEXT: add 3, 3, 4
-; CHECK-NEXT: clrlwi 4, 3, 16
-; CHECK-NEXT: cmplw 5, 4
+; CHECK-NEXT: andis. 4, 3, 1
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iselgt 3, 4, 3
+; CHECK-NEXT: iseleq 3, 3, 4
; CHECK-NEXT: blr
%a = add i16 %x, %y
%c = icmp ugt i16 %x, %a
@@ -306,9 +304,9 @@ define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) {
; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_sum:
; CHECK: # %bb.0:
; CHECK-NEXT: add 4, 3, 4
-; CHECK-NEXT: cmplw 3, 4
+; CHECK-NEXT: cmplw 4, 3
; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: iselgt 3, 3, 4
+; CHECK-NEXT: isellt 3, 3, 4
; CHECK-NEXT: blr
%a = add i32 %x, %y
%c = icmp ugt i32 %x, %a
@@ -402,7 +400,9 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_sum(<16 x i8> %x) {
; CHECK-NEXT: addis 3, 2, .LCPI25_0 at toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI25_0 at toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
-; CHECK-NEXT: vaddubs 2, 2, 3
+; CHECK-NEXT: vaddubm 3, 2, 3
+; CHECK-NEXT: vcmpgtub 2, 2, 3
+; CHECK-NEXT: xxlor 34, 34, 35
; CHECK-NEXT: blr
%a = add <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
%c = icmp ugt <16 x i8> %x, %a
@@ -448,7 +448,9 @@ define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_sum(<8 x i16> %x) {
; CHECK-NEXT: addis 3, 2, .LCPI28_0 at toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI28_0 at toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
-; CHECK-NEXT: vadduhs 2, 2, 3
+; CHECK-NEXT: vadduhm 3, 2, 3
+; CHECK-NEXT: vcmpgtuh 2, 2, 3
+; CHECK-NEXT: xxlor 34, 34, 35
; CHECK-NEXT: blr
%a = add <8 x i16> %x, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
%c = icmp ugt <8 x i16> %x, %a
@@ -494,7 +496,9 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_sum(<4 x i32> %x) {
; CHECK-NEXT: addis 3, 2, .LCPI31_0 at toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI31_0 at toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
-; CHECK-NEXT: vadduws 2, 2, 3
+; CHECK-NEXT: vadduwm 3, 2, 3
+; CHECK-NEXT: vcmpgtuw 2, 2, 3
+; CHECK-NEXT: xxlor 34, 34, 35
; CHECK-NEXT: blr
%a = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
%c = icmp ugt <4 x i32> %x, %a
@@ -586,7 +590,9 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_min(<16 x i8> %x, <16 x i8>
define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_sum(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: vaddubs 2, 2, 3
+; CHECK-NEXT: vaddubm 3, 2, 3
+; CHECK-NEXT: vcmpgtub 2, 2, 3
+; CHECK-NEXT: xxlor 34, 34, 35
; CHECK-NEXT: blr
%a = add <16 x i8> %x, %y
%c = icmp ugt <16 x i8> %x, %a
@@ -626,7 +632,9 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_min(<8 x i16> %x, <8 x i16>
define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: vadduhs 2, 2, 3
+; CHECK-NEXT: vadduhm 3, 2, 3
+; CHECK-NEXT: vcmpgtuh 2, 2, 3
+; CHECK-NEXT: xxlor 34, 34, 35
; CHECK-NEXT: blr
%a = add <8 x i16> %x, %y
%c = icmp ugt <8 x i16> %x, %a
@@ -666,7 +674,9 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32>
define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: vadduws 2, 2, 3
+; CHECK-NEXT: vadduwm 3, 2, 3
+; CHECK-NEXT: vcmpgtuw 2, 2, 3
+; CHECK-NEXT: xxlor 34, 34, 35
; CHECK-NEXT: blr
%a = add <4 x i32> %x, %y
%c = icmp ugt <4 x i32> %x, %a
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.s b/llvm/test/CodeGen/PowerPC/sat-add.s
new file mode 100644
index 00000000000000..ca085fc0f6997f
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/sat-add.s
@@ -0,0 +1,1260 @@
+ .abiversion 2
+ .file "sat-add.ll"
+ .text
+ .globl unsigned_sat_constant_i8_using_min # -- Begin function unsigned_sat_constant_i8_using_min
+ .p2align 4
+ .type unsigned_sat_constant_i8_using_min, at function
+unsigned_sat_constant_i8_using_min: # @unsigned_sat_constant_i8_using_min
+.Lfunc_begin0:
+ .cfi_startproc
+# %bb.0:
+ clrlwi 4, 3, 24
+ cmplwi 4, 213
+ li 4, -43
+ isellt 3, 3, 4
+ addi 3, 3, 42
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end0:
+ .size unsigned_sat_constant_i8_using_min, .Lfunc_end0-.Lfunc_begin0
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i8_using_cmp_sum # -- Begin function unsigned_sat_constant_i8_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_constant_i8_using_cmp_sum, at function
+unsigned_sat_constant_i8_using_cmp_sum: # @unsigned_sat_constant_i8_using_cmp_sum
+.Lfunc_begin1:
+ .cfi_startproc
+# %bb.0:
+ clrlwi 3, 3, 24
+ addi 3, 3, 42
+ andi. 4, 3, 256
+ li 4, -1
+ iseleq 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end1:
+ .size unsigned_sat_constant_i8_using_cmp_sum, .Lfunc_end1-.Lfunc_begin1
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i8_using_cmp_notval # -- Begin function unsigned_sat_constant_i8_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_constant_i8_using_cmp_notval, at function
+unsigned_sat_constant_i8_using_cmp_notval: # @unsigned_sat_constant_i8_using_cmp_notval
+.Lfunc_begin2:
+ .cfi_startproc
+# %bb.0:
+ clrlwi 4, 3, 24
+ addi 3, 3, 42
+ cmplwi 4, 213
+ li 4, -1
+ iselgt 3, 4, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end2:
+ .size unsigned_sat_constant_i8_using_cmp_notval, .Lfunc_end2-.Lfunc_begin2
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i16_using_min # -- Begin function unsigned_sat_constant_i16_using_min
+ .p2align 4
+ .type unsigned_sat_constant_i16_using_min, at function
+unsigned_sat_constant_i16_using_min: # @unsigned_sat_constant_i16_using_min
+.Lfunc_begin3:
+ .cfi_startproc
+# %bb.0:
+ clrlwi 4, 3, 16
+ cmplwi 4, 65493
+ li 4, -43
+ isellt 3, 3, 4
+ addi 3, 3, 42
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end3:
+ .size unsigned_sat_constant_i16_using_min, .Lfunc_end3-.Lfunc_begin3
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i16_using_cmp_sum # -- Begin function unsigned_sat_constant_i16_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_constant_i16_using_cmp_sum, at function
+unsigned_sat_constant_i16_using_cmp_sum: # @unsigned_sat_constant_i16_using_cmp_sum
+.Lfunc_begin4:
+ .cfi_startproc
+# %bb.0:
+ clrlwi 3, 3, 16
+ addi 3, 3, 42
+ andis. 4, 3, 1
+ li 4, -1
+ iseleq 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end4:
+ .size unsigned_sat_constant_i16_using_cmp_sum, .Lfunc_end4-.Lfunc_begin4
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i16_using_cmp_notval # -- Begin function unsigned_sat_constant_i16_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_constant_i16_using_cmp_notval, at function
+unsigned_sat_constant_i16_using_cmp_notval: # @unsigned_sat_constant_i16_using_cmp_notval
+.Lfunc_begin5:
+ .cfi_startproc
+# %bb.0:
+ clrlwi 4, 3, 16
+ addi 3, 3, 42
+ cmplwi 4, 65493
+ li 4, -1
+ iselgt 3, 4, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end5:
+ .size unsigned_sat_constant_i16_using_cmp_notval, .Lfunc_end5-.Lfunc_begin5
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i32_using_min # -- Begin function unsigned_sat_constant_i32_using_min
+ .p2align 4
+ .type unsigned_sat_constant_i32_using_min, at function
+unsigned_sat_constant_i32_using_min: # @unsigned_sat_constant_i32_using_min
+.Lfunc_begin6:
+ .cfi_startproc
+# %bb.0:
+ li 4, -43
+ cmplw 3, 4
+ isellt 3, 3, 4
+ addi 3, 3, 42
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end6:
+ .size unsigned_sat_constant_i32_using_min, .Lfunc_end6-.Lfunc_begin6
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i32_using_cmp_sum # -- Begin function unsigned_sat_constant_i32_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_constant_i32_using_cmp_sum, at function
+unsigned_sat_constant_i32_using_cmp_sum: # @unsigned_sat_constant_i32_using_cmp_sum
+.Lfunc_begin7:
+ .cfi_startproc
+# %bb.0:
+ addi 4, 3, 42
+ cmplw 4, 3
+ li 3, -1
+ isellt 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end7:
+ .size unsigned_sat_constant_i32_using_cmp_sum, .Lfunc_end7-.Lfunc_begin7
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i32_using_cmp_notval # -- Begin function unsigned_sat_constant_i32_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_constant_i32_using_cmp_notval, at function
+unsigned_sat_constant_i32_using_cmp_notval: # @unsigned_sat_constant_i32_using_cmp_notval
+.Lfunc_begin8:
+ .cfi_startproc
+# %bb.0:
+ li 5, -43
+ addi 4, 3, 42
+ cmplw 3, 5
+ li 3, -1
+ iselgt 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end8:
+ .size unsigned_sat_constant_i32_using_cmp_notval, .Lfunc_end8-.Lfunc_begin8
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i64_using_min # -- Begin function unsigned_sat_constant_i64_using_min
+ .p2align 4
+ .type unsigned_sat_constant_i64_using_min, at function
+unsigned_sat_constant_i64_using_min: # @unsigned_sat_constant_i64_using_min
+.Lfunc_begin9:
+ .cfi_startproc
+# %bb.0:
+ li 4, -43
+ cmpld 3, 4
+ isellt 3, 3, 4
+ addi 3, 3, 42
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end9:
+ .size unsigned_sat_constant_i64_using_min, .Lfunc_end9-.Lfunc_begin9
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i64_using_cmp_sum # -- Begin function unsigned_sat_constant_i64_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_constant_i64_using_cmp_sum, at function
+unsigned_sat_constant_i64_using_cmp_sum: # @unsigned_sat_constant_i64_using_cmp_sum
+.Lfunc_begin10:
+ .cfi_startproc
+# %bb.0:
+ li 4, 0
+ addic 3, 3, 42
+ addze. 4, 4
+ li 4, -1
+ iseleq 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end10:
+ .size unsigned_sat_constant_i64_using_cmp_sum, .Lfunc_end10-.Lfunc_begin10
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i64_using_cmp_notval # -- Begin function unsigned_sat_constant_i64_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_constant_i64_using_cmp_notval, at function
+unsigned_sat_constant_i64_using_cmp_notval: # @unsigned_sat_constant_i64_using_cmp_notval
+.Lfunc_begin11:
+ .cfi_startproc
+# %bb.0:
+ li 4, 0
+ addic 3, 3, 42
+ addze. 4, 4
+ li 4, -1
+ iseleq 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end11:
+ .size unsigned_sat_constant_i64_using_cmp_notval, .Lfunc_end11-.Lfunc_begin11
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_i8_using_min # -- Begin function unsigned_sat_variable_i8_using_min
+ .p2align 4
+ .type unsigned_sat_variable_i8_using_min, at function
+unsigned_sat_variable_i8_using_min: # @unsigned_sat_variable_i8_using_min
+.Lfunc_begin12:
+ .cfi_startproc
+# %bb.0:
+ not 6, 4
+ clrlwi 5, 3, 24
+ clrlwi 7, 6, 24
+ cmplw 5, 7
+ isellt 3, 3, 6
+ add 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end12:
+ .size unsigned_sat_variable_i8_using_min, .Lfunc_end12-.Lfunc_begin12
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_i8_using_cmp_sum # -- Begin function unsigned_sat_variable_i8_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_variable_i8_using_cmp_sum, at function
+unsigned_sat_variable_i8_using_cmp_sum: # @unsigned_sat_variable_i8_using_cmp_sum
+.Lfunc_begin13:
+ .cfi_startproc
+# %bb.0:
+ clrlwi 4, 4, 24
+ clrlwi 3, 3, 24
+ add 3, 3, 4
+ andi. 4, 3, 256
+ li 4, -1
+ iseleq 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end13:
+ .size unsigned_sat_variable_i8_using_cmp_sum, .Lfunc_end13-.Lfunc_begin13
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_i8_using_cmp_notval # -- Begin function unsigned_sat_variable_i8_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_variable_i8_using_cmp_notval, at function
+unsigned_sat_variable_i8_using_cmp_notval: # @unsigned_sat_variable_i8_using_cmp_notval
+.Lfunc_begin14:
+ .cfi_startproc
+# %bb.0:
+ not 6, 4
+ clrlwi 5, 3, 24
+ add 3, 3, 4
+ li 4, -1
+ clrlwi 6, 6, 24
+ cmplw 5, 6
+ iselgt 3, 4, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end14:
+ .size unsigned_sat_variable_i8_using_cmp_notval, .Lfunc_end14-.Lfunc_begin14
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_i16_using_min # -- Begin function unsigned_sat_variable_i16_using_min
+ .p2align 4
+ .type unsigned_sat_variable_i16_using_min, at function
+unsigned_sat_variable_i16_using_min: # @unsigned_sat_variable_i16_using_min
+.Lfunc_begin15:
+ .cfi_startproc
+# %bb.0:
+ not 6, 4
+ clrlwi 5, 3, 16
+ clrlwi 7, 6, 16
+ cmplw 5, 7
+ isellt 3, 3, 6
+ add 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end15:
+ .size unsigned_sat_variable_i16_using_min, .Lfunc_end15-.Lfunc_begin15
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_i16_using_cmp_sum # -- Begin function unsigned_sat_variable_i16_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_variable_i16_using_cmp_sum, at function
+unsigned_sat_variable_i16_using_cmp_sum: # @unsigned_sat_variable_i16_using_cmp_sum
+.Lfunc_begin16:
+ .cfi_startproc
+# %bb.0:
+ clrlwi 4, 4, 16
+ clrlwi 3, 3, 16
+ add 3, 3, 4
+ andis. 4, 3, 1
+ li 4, -1
+ iseleq 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end16:
+ .size unsigned_sat_variable_i16_using_cmp_sum, .Lfunc_end16-.Lfunc_begin16
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_i16_using_cmp_notval # -- Begin function unsigned_sat_variable_i16_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_variable_i16_using_cmp_notval, at function
+unsigned_sat_variable_i16_using_cmp_notval: # @unsigned_sat_variable_i16_using_cmp_notval
+.Lfunc_begin17:
+ .cfi_startproc
+# %bb.0:
+ not 6, 4
+ clrlwi 5, 3, 16
+ add 3, 3, 4
+ li 4, -1
+ clrlwi 6, 6, 16
+ cmplw 5, 6
+ iselgt 3, 4, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end17:
+ .size unsigned_sat_variable_i16_using_cmp_notval, .Lfunc_end17-.Lfunc_begin17
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_i32_using_min # -- Begin function unsigned_sat_variable_i32_using_min
+ .p2align 4
+ .type unsigned_sat_variable_i32_using_min, at function
+unsigned_sat_variable_i32_using_min: # @unsigned_sat_variable_i32_using_min
+.Lfunc_begin18:
+ .cfi_startproc
+# %bb.0:
+ not 5, 4
+ cmplw 3, 5
+ isellt 3, 3, 5
+ add 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end18:
+ .size unsigned_sat_variable_i32_using_min, .Lfunc_end18-.Lfunc_begin18
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_i32_using_cmp_sum # -- Begin function unsigned_sat_variable_i32_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_variable_i32_using_cmp_sum, at function
+unsigned_sat_variable_i32_using_cmp_sum: # @unsigned_sat_variable_i32_using_cmp_sum
+.Lfunc_begin19:
+ .cfi_startproc
+# %bb.0:
+ add 4, 3, 4
+ cmplw 4, 3
+ li 3, -1
+ isellt 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end19:
+ .size unsigned_sat_variable_i32_using_cmp_sum, .Lfunc_end19-.Lfunc_begin19
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_i32_using_cmp_notval # -- Begin function unsigned_sat_variable_i32_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_variable_i32_using_cmp_notval, at function
+unsigned_sat_variable_i32_using_cmp_notval: # @unsigned_sat_variable_i32_using_cmp_notval
+.Lfunc_begin20:
+ .cfi_startproc
+# %bb.0:
+ not 5, 4
+ add 4, 3, 4
+ cmplw 3, 5
+ li 3, -1
+ iselgt 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end20:
+ .size unsigned_sat_variable_i32_using_cmp_notval, .Lfunc_end20-.Lfunc_begin20
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_i64_using_min # -- Begin function unsigned_sat_variable_i64_using_min
+ .p2align 4
+ .type unsigned_sat_variable_i64_using_min, at function
+unsigned_sat_variable_i64_using_min: # @unsigned_sat_variable_i64_using_min
+.Lfunc_begin21:
+ .cfi_startproc
+# %bb.0:
+ not 5, 4
+ cmpld 3, 5
+ isellt 3, 3, 5
+ add 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end21:
+ .size unsigned_sat_variable_i64_using_min, .Lfunc_end21-.Lfunc_begin21
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_i64_using_cmp_sum # -- Begin function unsigned_sat_variable_i64_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_variable_i64_using_cmp_sum, at function
+unsigned_sat_variable_i64_using_cmp_sum: # @unsigned_sat_variable_i64_using_cmp_sum
+.Lfunc_begin22:
+ .cfi_startproc
+# %bb.0:
+ addc 3, 3, 4
+ li 4, 0
+ addze. 4, 4
+ li 4, -1
+ iseleq 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end22:
+ .size unsigned_sat_variable_i64_using_cmp_sum, .Lfunc_end22-.Lfunc_begin22
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_i64_using_cmp_notval # -- Begin function unsigned_sat_variable_i64_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_variable_i64_using_cmp_notval, at function
+unsigned_sat_variable_i64_using_cmp_notval: # @unsigned_sat_variable_i64_using_cmp_notval
+.Lfunc_begin23:
+ .cfi_startproc
+# %bb.0:
+ not 5, 4
+ add 4, 3, 4
+ cmpld 3, 5
+ li 3, -1
+ iselgt 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end23:
+ .size unsigned_sat_variable_i64_using_cmp_notval, .Lfunc_end23-.Lfunc_begin23
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v16i8_using_min
+.LCPI24_0:
+ .space 16,213
+.LCPI24_1:
+ .space 16,42
+ .text
+ .globl unsigned_sat_constant_v16i8_using_min
+ .p2align 4
+ .type unsigned_sat_constant_v16i8_using_min, at function
+unsigned_sat_constant_v16i8_using_min: # @unsigned_sat_constant_v16i8_using_min
+.Lfunc_begin24:
+ .cfi_startproc
+.Lfunc_gep24:
+ addis 2, 12, .TOC.-.Lfunc_gep24 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep24 at l
+.Lfunc_lep24:
+ .localentry unsigned_sat_constant_v16i8_using_min, .Lfunc_lep24-.Lfunc_gep24
+# %bb.0:
+ addis 3, 2, .LCPI24_0 at toc@ha
+ addi 3, 3, .LCPI24_0 at toc@l
+ lxvd2x 35, 0, 3
+ addis 3, 2, .LCPI24_1 at toc@ha
+ addi 3, 3, .LCPI24_1 at toc@l
+ vminub 2, 2, 3
+ lxvd2x 35, 0, 3
+ vaddubm 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end24:
+ .size unsigned_sat_constant_v16i8_using_min, .Lfunc_end24-.Lfunc_begin24
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v16i8_using_cmp_sum
+.LCPI25_0:
+ .space 16,42
+ .text
+ .globl unsigned_sat_constant_v16i8_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_constant_v16i8_using_cmp_sum, at function
+unsigned_sat_constant_v16i8_using_cmp_sum: # @unsigned_sat_constant_v16i8_using_cmp_sum
+.Lfunc_begin25:
+ .cfi_startproc
+.Lfunc_gep25:
+ addis 2, 12, .TOC.-.Lfunc_gep25 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep25 at l
+.Lfunc_lep25:
+ .localentry unsigned_sat_constant_v16i8_using_cmp_sum, .Lfunc_lep25-.Lfunc_gep25
+# %bb.0:
+ addis 3, 2, .LCPI25_0 at toc@ha
+ addi 3, 3, .LCPI25_0 at toc@l
+ lxvd2x 35, 0, 3
+ vaddubs 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end25:
+ .size unsigned_sat_constant_v16i8_using_cmp_sum, .Lfunc_end25-.Lfunc_begin25
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v16i8_using_cmp_notval
+.LCPI26_0:
+ .space 16,42
+ .text
+ .globl unsigned_sat_constant_v16i8_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_constant_v16i8_using_cmp_notval, at function
+unsigned_sat_constant_v16i8_using_cmp_notval: # @unsigned_sat_constant_v16i8_using_cmp_notval
+.Lfunc_begin26:
+ .cfi_startproc
+.Lfunc_gep26:
+ addis 2, 12, .TOC.-.Lfunc_gep26 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep26 at l
+.Lfunc_lep26:
+ .localentry unsigned_sat_constant_v16i8_using_cmp_notval, .Lfunc_lep26-.Lfunc_gep26
+# %bb.0:
+ addis 3, 2, .LCPI26_0 at toc@ha
+ addi 3, 3, .LCPI26_0 at toc@l
+ lxvd2x 35, 0, 3
+ vaddubs 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end26:
+ .size unsigned_sat_constant_v16i8_using_cmp_notval, .Lfunc_end26-.Lfunc_begin26
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v8i16_using_min
+.LCPI27_0:
+ .short 65493 # 0xffd5
+ .short 65493 # 0xffd5
+ .short 65493 # 0xffd5
+ .short 65493 # 0xffd5
+ .short 65493 # 0xffd5
+ .short 65493 # 0xffd5
+ .short 65493 # 0xffd5
+ .short 65493 # 0xffd5
+.LCPI27_1:
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .text
+ .globl unsigned_sat_constant_v8i16_using_min
+ .p2align 4
+ .type unsigned_sat_constant_v8i16_using_min, at function
+unsigned_sat_constant_v8i16_using_min: # @unsigned_sat_constant_v8i16_using_min
+.Lfunc_begin27:
+ .cfi_startproc
+.Lfunc_gep27:
+ addis 2, 12, .TOC.-.Lfunc_gep27 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep27 at l
+.Lfunc_lep27:
+ .localentry unsigned_sat_constant_v8i16_using_min, .Lfunc_lep27-.Lfunc_gep27
+# %bb.0:
+ addis 3, 2, .LCPI27_0 at toc@ha
+ addi 3, 3, .LCPI27_0 at toc@l
+ lxvd2x 35, 0, 3
+ addis 3, 2, .LCPI27_1 at toc@ha
+ addi 3, 3, .LCPI27_1 at toc@l
+ vminuh 2, 2, 3
+ lxvd2x 35, 0, 3
+ vadduhm 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end27:
+ .size unsigned_sat_constant_v8i16_using_min, .Lfunc_end27-.Lfunc_begin27
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v8i16_using_cmp_sum
+.LCPI28_0:
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .text
+ .globl unsigned_sat_constant_v8i16_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_constant_v8i16_using_cmp_sum, at function
+unsigned_sat_constant_v8i16_using_cmp_sum: # @unsigned_sat_constant_v8i16_using_cmp_sum
+.Lfunc_begin28:
+ .cfi_startproc
+.Lfunc_gep28:
+ addis 2, 12, .TOC.-.Lfunc_gep28 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep28 at l
+.Lfunc_lep28:
+ .localentry unsigned_sat_constant_v8i16_using_cmp_sum, .Lfunc_lep28-.Lfunc_gep28
+# %bb.0:
+ addis 3, 2, .LCPI28_0 at toc@ha
+ addi 3, 3, .LCPI28_0 at toc@l
+ lxvd2x 35, 0, 3
+ vadduhs 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end28:
+ .size unsigned_sat_constant_v8i16_using_cmp_sum, .Lfunc_end28-.Lfunc_begin28
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v8i16_using_cmp_notval
+.LCPI29_0:
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .short 42 # 0x2a
+ .text
+ .globl unsigned_sat_constant_v8i16_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_constant_v8i16_using_cmp_notval, at function
+unsigned_sat_constant_v8i16_using_cmp_notval: # @unsigned_sat_constant_v8i16_using_cmp_notval
+.Lfunc_begin29:
+ .cfi_startproc
+.Lfunc_gep29:
+ addis 2, 12, .TOC.-.Lfunc_gep29 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep29 at l
+.Lfunc_lep29:
+ .localentry unsigned_sat_constant_v8i16_using_cmp_notval, .Lfunc_lep29-.Lfunc_gep29
+# %bb.0:
+ addis 3, 2, .LCPI29_0 at toc@ha
+ addi 3, 3, .LCPI29_0 at toc@l
+ lxvd2x 35, 0, 3
+ vadduhs 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end29:
+ .size unsigned_sat_constant_v8i16_using_cmp_notval, .Lfunc_end29-.Lfunc_begin29
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v4i32_using_min
+.LCPI30_0:
+ .long 4294967253 # 0xffffffd5
+ .long 4294967253 # 0xffffffd5
+ .long 4294967253 # 0xffffffd5
+ .long 4294967253 # 0xffffffd5
+.LCPI30_1:
+ .long 42 # 0x2a
+ .long 42 # 0x2a
+ .long 42 # 0x2a
+ .long 42 # 0x2a
+ .text
+ .globl unsigned_sat_constant_v4i32_using_min
+ .p2align 4
+ .type unsigned_sat_constant_v4i32_using_min, at function
+unsigned_sat_constant_v4i32_using_min: # @unsigned_sat_constant_v4i32_using_min
+.Lfunc_begin30:
+ .cfi_startproc
+.Lfunc_gep30:
+ addis 2, 12, .TOC.-.Lfunc_gep30 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep30 at l
+.Lfunc_lep30:
+ .localentry unsigned_sat_constant_v4i32_using_min, .Lfunc_lep30-.Lfunc_gep30
+# %bb.0:
+ addis 3, 2, .LCPI30_0 at toc@ha
+ addi 3, 3, .LCPI30_0 at toc@l
+ lxvd2x 35, 0, 3
+ addis 3, 2, .LCPI30_1 at toc@ha
+ addi 3, 3, .LCPI30_1 at toc@l
+ vminuw 2, 2, 3
+ lxvd2x 35, 0, 3
+ vadduwm 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end30:
+ .size unsigned_sat_constant_v4i32_using_min, .Lfunc_end30-.Lfunc_begin30
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v4i32_using_cmp_sum
+.LCPI31_0:
+ .long 42 # 0x2a
+ .long 42 # 0x2a
+ .long 42 # 0x2a
+ .long 42 # 0x2a
+ .text
+ .globl unsigned_sat_constant_v4i32_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_constant_v4i32_using_cmp_sum, at function
+unsigned_sat_constant_v4i32_using_cmp_sum: # @unsigned_sat_constant_v4i32_using_cmp_sum
+.Lfunc_begin31:
+ .cfi_startproc
+.Lfunc_gep31:
+ addis 2, 12, .TOC.-.Lfunc_gep31 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep31 at l
+.Lfunc_lep31:
+ .localentry unsigned_sat_constant_v4i32_using_cmp_sum, .Lfunc_lep31-.Lfunc_gep31
+# %bb.0:
+ addis 3, 2, .LCPI31_0 at toc@ha
+ addi 3, 3, .LCPI31_0 at toc@l
+ lxvd2x 35, 0, 3
+ vadduws 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end31:
+ .size unsigned_sat_constant_v4i32_using_cmp_sum, .Lfunc_end31-.Lfunc_begin31
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v4i32_using_cmp_notval
+.LCPI32_0:
+ .long 42 # 0x2a
+ .long 42 # 0x2a
+ .long 42 # 0x2a
+ .long 42 # 0x2a
+ .text
+ .globl unsigned_sat_constant_v4i32_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_constant_v4i32_using_cmp_notval, at function
+unsigned_sat_constant_v4i32_using_cmp_notval: # @unsigned_sat_constant_v4i32_using_cmp_notval
+.Lfunc_begin32:
+ .cfi_startproc
+.Lfunc_gep32:
+ addis 2, 12, .TOC.-.Lfunc_gep32 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep32 at l
+.Lfunc_lep32:
+ .localentry unsigned_sat_constant_v4i32_using_cmp_notval, .Lfunc_lep32-.Lfunc_gep32
+# %bb.0:
+ addis 3, 2, .LCPI32_0 at toc@ha
+ addi 3, 3, .LCPI32_0 at toc@l
+ lxvd2x 35, 0, 3
+ vadduws 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end32:
+ .size unsigned_sat_constant_v4i32_using_cmp_notval, .Lfunc_end32-.Lfunc_begin32
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v2i64_using_min
+.LCPI33_0:
+ .quad -43 # 0xffffffffffffffd5
+ .quad -43 # 0xffffffffffffffd5
+.LCPI33_1:
+ .quad 42 # 0x2a
+ .quad 42 # 0x2a
+ .text
+ .globl unsigned_sat_constant_v2i64_using_min
+ .p2align 4
+ .type unsigned_sat_constant_v2i64_using_min, at function
+unsigned_sat_constant_v2i64_using_min: # @unsigned_sat_constant_v2i64_using_min
+.Lfunc_begin33:
+ .cfi_startproc
+.Lfunc_gep33:
+ addis 2, 12, .TOC.-.Lfunc_gep33 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep33 at l
+.Lfunc_lep33:
+ .localentry unsigned_sat_constant_v2i64_using_min, .Lfunc_lep33-.Lfunc_gep33
+# %bb.0:
+ addis 3, 2, .LCPI33_0 at toc@ha
+ addi 3, 3, .LCPI33_0 at toc@l
+ lxvd2x 35, 0, 3
+ addis 3, 2, .LCPI33_1 at toc@ha
+ addi 3, 3, .LCPI33_1 at toc@l
+ vminud 2, 2, 3
+ lxvd2x 35, 0, 3
+ vaddudm 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end33:
+ .size unsigned_sat_constant_v2i64_using_min, .Lfunc_end33-.Lfunc_begin33
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v2i64_using_cmp_sum
+.LCPI34_0:
+ .quad 42 # 0x2a
+ .quad 42 # 0x2a
+.LCPI34_1:
+ .quad -43 # 0xffffffffffffffd5
+ .quad -43 # 0xffffffffffffffd5
+ .text
+ .globl unsigned_sat_constant_v2i64_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_constant_v2i64_using_cmp_sum, at function
+unsigned_sat_constant_v2i64_using_cmp_sum: # @unsigned_sat_constant_v2i64_using_cmp_sum
+.Lfunc_begin34:
+ .cfi_startproc
+.Lfunc_gep34:
+ addis 2, 12, .TOC.-.Lfunc_gep34 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep34 at l
+.Lfunc_lep34:
+ .localentry unsigned_sat_constant_v2i64_using_cmp_sum, .Lfunc_lep34-.Lfunc_gep34
+# %bb.0:
+ addis 3, 2, .LCPI34_0 at toc@ha
+ addi 3, 3, .LCPI34_0 at toc@l
+ lxvd2x 35, 0, 3
+ addis 3, 2, .LCPI34_1 at toc@ha
+ addi 3, 3, .LCPI34_1 at toc@l
+ lxvd2x 36, 0, 3
+ vminud 2, 2, 4
+ vaddudm 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end34:
+ .size unsigned_sat_constant_v2i64_using_cmp_sum, .Lfunc_end34-.Lfunc_begin34
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v2i64_using_cmp_notval
+.LCPI35_0:
+ .quad 42 # 0x2a
+ .quad 42 # 0x2a
+.LCPI35_1:
+ .quad -43 # 0xffffffffffffffd5
+ .quad -43 # 0xffffffffffffffd5
+ .text
+ .globl unsigned_sat_constant_v2i64_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_constant_v2i64_using_cmp_notval, at function
+unsigned_sat_constant_v2i64_using_cmp_notval: # @unsigned_sat_constant_v2i64_using_cmp_notval
+.Lfunc_begin35:
+ .cfi_startproc
+.Lfunc_gep35:
+ addis 2, 12, .TOC.-.Lfunc_gep35 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep35 at l
+.Lfunc_lep35:
+ .localentry unsigned_sat_constant_v2i64_using_cmp_notval, .Lfunc_lep35-.Lfunc_gep35
+# %bb.0:
+ addis 3, 2, .LCPI35_0 at toc@ha
+ addi 3, 3, .LCPI35_0 at toc@l
+ lxvd2x 35, 0, 3
+ addis 3, 2, .LCPI35_1 at toc@ha
+ addi 3, 3, .LCPI35_1 at toc@l
+ lxvd2x 36, 0, 3
+ vaddudm 3, 2, 3
+ vcmpgtud 2, 2, 4
+ xxlor 34, 34, 35
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end35:
+ .size unsigned_sat_constant_v2i64_using_cmp_notval, .Lfunc_end35-.Lfunc_begin35
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_v16i8_using_min # -- Begin function unsigned_sat_variable_v16i8_using_min
+ .p2align 4
+ .type unsigned_sat_variable_v16i8_using_min, at function
+unsigned_sat_variable_v16i8_using_min: # @unsigned_sat_variable_v16i8_using_min
+.Lfunc_begin36:
+ .cfi_startproc
+# %bb.0:
+ xxlnor 36, 35, 35
+ vminub 2, 2, 4
+ vaddubm 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end36:
+ .size unsigned_sat_variable_v16i8_using_min, .Lfunc_end36-.Lfunc_begin36
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_v16i8_using_cmp_sum # -- Begin function unsigned_sat_variable_v16i8_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_variable_v16i8_using_cmp_sum, at function
+unsigned_sat_variable_v16i8_using_cmp_sum: # @unsigned_sat_variable_v16i8_using_cmp_sum
+.Lfunc_begin37:
+ .cfi_startproc
+# %bb.0:
+ vaddubs 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end37:
+ .size unsigned_sat_variable_v16i8_using_cmp_sum, .Lfunc_end37-.Lfunc_begin37
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_v16i8_using_cmp_notval # -- Begin function unsigned_sat_variable_v16i8_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_variable_v16i8_using_cmp_notval, at function
+unsigned_sat_variable_v16i8_using_cmp_notval: # @unsigned_sat_variable_v16i8_using_cmp_notval
+.Lfunc_begin38:
+ .cfi_startproc
+# %bb.0:
+ xxlnor 36, 35, 35
+ vaddubm 3, 2, 3
+ vcmpgtub 2, 2, 4
+ xxlor 34, 34, 35
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end38:
+ .size unsigned_sat_variable_v16i8_using_cmp_notval, .Lfunc_end38-.Lfunc_begin38
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_v8i16_using_min # -- Begin function unsigned_sat_variable_v8i16_using_min
+ .p2align 4
+ .type unsigned_sat_variable_v8i16_using_min, at function
+unsigned_sat_variable_v8i16_using_min: # @unsigned_sat_variable_v8i16_using_min
+.Lfunc_begin39:
+ .cfi_startproc
+# %bb.0:
+ xxlnor 36, 35, 35
+ vminuh 2, 2, 4
+ vadduhm 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end39:
+ .size unsigned_sat_variable_v8i16_using_min, .Lfunc_end39-.Lfunc_begin39
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_v8i16_using_cmp_sum # -- Begin function unsigned_sat_variable_v8i16_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_variable_v8i16_using_cmp_sum, at function
+unsigned_sat_variable_v8i16_using_cmp_sum: # @unsigned_sat_variable_v8i16_using_cmp_sum
+.Lfunc_begin40:
+ .cfi_startproc
+# %bb.0:
+ vadduhs 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end40:
+ .size unsigned_sat_variable_v8i16_using_cmp_sum, .Lfunc_end40-.Lfunc_begin40
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_v8i16_using_cmp_notval # -- Begin function unsigned_sat_variable_v8i16_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_variable_v8i16_using_cmp_notval, at function
+unsigned_sat_variable_v8i16_using_cmp_notval: # @unsigned_sat_variable_v8i16_using_cmp_notval
+.Lfunc_begin41:
+ .cfi_startproc
+# %bb.0:
+ xxlnor 36, 35, 35
+ vadduhm 3, 2, 3
+ vcmpgtuh 2, 2, 4
+ xxlor 34, 34, 35
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end41:
+ .size unsigned_sat_variable_v8i16_using_cmp_notval, .Lfunc_end41-.Lfunc_begin41
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_v4i32_using_min # -- Begin function unsigned_sat_variable_v4i32_using_min
+ .p2align 4
+ .type unsigned_sat_variable_v4i32_using_min, at function
+unsigned_sat_variable_v4i32_using_min: # @unsigned_sat_variable_v4i32_using_min
+.Lfunc_begin42:
+ .cfi_startproc
+# %bb.0:
+ xxlnor 36, 35, 35
+ vminuw 2, 2, 4
+ vadduwm 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end42:
+ .size unsigned_sat_variable_v4i32_using_min, .Lfunc_end42-.Lfunc_begin42
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_v4i32_using_cmp_sum # -- Begin function unsigned_sat_variable_v4i32_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_variable_v4i32_using_cmp_sum, at function
+unsigned_sat_variable_v4i32_using_cmp_sum: # @unsigned_sat_variable_v4i32_using_cmp_sum
+.Lfunc_begin43:
+ .cfi_startproc
+# %bb.0:
+ vadduws 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end43:
+ .size unsigned_sat_variable_v4i32_using_cmp_sum, .Lfunc_end43-.Lfunc_begin43
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_v4i32_using_cmp_notval # -- Begin function unsigned_sat_variable_v4i32_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_variable_v4i32_using_cmp_notval, at function
+unsigned_sat_variable_v4i32_using_cmp_notval: # @unsigned_sat_variable_v4i32_using_cmp_notval
+.Lfunc_begin44:
+ .cfi_startproc
+# %bb.0:
+ xxlnor 36, 35, 35
+ vadduwm 3, 2, 3
+ vcmpgtuw 2, 2, 4
+ xxlor 34, 34, 35
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end44:
+ .size unsigned_sat_variable_v4i32_using_cmp_notval, .Lfunc_end44-.Lfunc_begin44
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_v2i64_using_min # -- Begin function unsigned_sat_variable_v2i64_using_min
+ .p2align 4
+ .type unsigned_sat_variable_v2i64_using_min, at function
+unsigned_sat_variable_v2i64_using_min: # @unsigned_sat_variable_v2i64_using_min
+.Lfunc_begin45:
+ .cfi_startproc
+# %bb.0:
+ xxlnor 36, 35, 35
+ vminud 2, 2, 4
+ vaddudm 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end45:
+ .size unsigned_sat_variable_v2i64_using_min, .Lfunc_end45-.Lfunc_begin45
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_v2i64_using_cmp_sum # -- Begin function unsigned_sat_variable_v2i64_using_cmp_sum
+ .p2align 4
+ .type unsigned_sat_variable_v2i64_using_cmp_sum, at function
+unsigned_sat_variable_v2i64_using_cmp_sum: # @unsigned_sat_variable_v2i64_using_cmp_sum
+.Lfunc_begin46:
+ .cfi_startproc
+# %bb.0:
+ xxlnor 36, 35, 35
+ vminud 2, 2, 4
+ vaddudm 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end46:
+ .size unsigned_sat_variable_v2i64_using_cmp_sum, .Lfunc_end46-.Lfunc_begin46
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_variable_v2i64_using_cmp_notval # -- Begin function unsigned_sat_variable_v2i64_using_cmp_notval
+ .p2align 4
+ .type unsigned_sat_variable_v2i64_using_cmp_notval, at function
+unsigned_sat_variable_v2i64_using_cmp_notval: # @unsigned_sat_variable_v2i64_using_cmp_notval
+.Lfunc_begin47:
+ .cfi_startproc
+# %bb.0:
+ xxlnor 36, 35, 35
+ vaddudm 3, 2, 3
+ vcmpgtud 2, 2, 4
+ xxlor 34, 34, 35
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end47:
+ .size unsigned_sat_variable_v2i64_using_cmp_notval, .Lfunc_end47-.Lfunc_begin47
+ .cfi_endproc
+ # -- End function
+ .section .rodata.cst16,"aM", at progbits,16
+ .p2align 4, 0x0 # -- Begin function sadd
+.LCPI48_0:
+ .quad 0 # 0x0
+ .quad -9223372036854775808 # 0x8000000000000000
+ .text
+ .globl sadd
+ .p2align 4
+ .type sadd, at function
+sadd: # @sadd
+.Lfunc_begin48:
+ .cfi_startproc
+.Lfunc_gep48:
+ addis 2, 12, .TOC.-.Lfunc_gep48 at ha
+ addi 2, 2, .TOC.-.Lfunc_gep48 at l
+.Lfunc_lep48:
+ .localentry sadd, .Lfunc_lep48-.Lfunc_gep48
+# %bb.0:
+ vadduqm 0, 2, 6
+ vadduqm 10, 4, 8
+ mfocrf 12, 32
+ stw 12, 8(1)
+ xxswapd 0, 34
+ xxswapd 4, 36
+ vadduqm 1, 3, 7
+ vadduqm 11, 5, 9
+ mffprd 3, 0
+ mffprd 6, 4
+ lwz 12, 8(1)
+ xxswapd 2, 35
+ xxswapd 5, 37
+ mffprd 4, 2
+ xxswapd 1, 32
+ xxswapd 6, 42
+ mffprd 5, 1
+ cmpld 6, 5, 3
+ mffprd 7, 6
+ xxswapd 3, 33
+ xxswapd 7, 43
+ mffprd 3, 3
+ cmpld 5, 7, 6
+ mffprd 6, 5
+ mffprd 7, 7
+ mfvsrd 5, 36
+ cmpld 3, 4
+ mfvsrd 3, 34
+ cmpld 1, 7, 6
+ mfvsrd 7, 32
+ mfvsrd 4, 35
+ mfvsrd 6, 37
+ cmpld 7, 7, 3
+ cmpd 2, 7, 3
+ mfvsrd 3, 33
+ crandc 21, 8, 30
+ crand 22, 30, 24
+ cmpld 6, 3, 4
+ cmpd 7, 3, 4
+ mfvsrd 4, 42
+ sradi 3, 3, 63
+ mtocrf 32, 12
+ crnor 21, 22, 21
+ crandc 23, 28, 26
+ crand 24, 26, 0
+ cmpld 4, 5
+ cmpd 7, 4, 5
+ mfvsrd 5, 43
+ crnor 22, 24, 23
+ mtfprd 5, 3
+ sradi 4, 4, 63
+ mtfprd 6, 4
+ crandc 25, 28, 2
+ crand 20, 2, 20
+ cmpld 5, 6
+ cmpd 7, 5, 6
+ mfvsrd 6, 38
+ sradi 5, 5, 63
+ crnor 20, 20, 25
+ mtfprd 7, 5
+ sradi 6, 6, 63
+ crandc 26, 28, 2
+ crand 27, 2, 4
+ crnor 23, 27, 26
+ mtfprd 0, 6
+ mfvsrd 6, 39
+ sradi 6, 6, 63
+ mtfprd 1, 6
+ mfvsrd 6, 40
+ sradi 6, 6, 63
+ mtfprd 2, 6
+ mfvsrd 6, 41
+ sradi 6, 6, 63
+ mtfprd 3, 6
+ sradi 6, 7, 63
+ mtfprd 4, 6
+ li 6, -1
+ isel 3, 0, 6, 21
+ isel 4, 0, 6, 22
+ isel 5, 0, 6, 20
+ isel 6, 0, 6, 23
+ mtfprd 8, 3
+ addis 3, 2, .LCPI48_0 at toc@ha
+ mtfprd 10, 4
+ mtfprd 11, 5
+ mtfprd 12, 6
+ addi 3, 3, .LCPI48_0 at toc@l
+ lxvd2x 9, 0, 3
+ xxspltd 45, 6, 0
+ xxspltd 46, 7, 0
+ xxspltd 34, 0, 0
+ xxspltd 40, 5, 0
+ xxspltd 35, 1, 0
+ xxspltd 36, 2, 0
+ xxspltd 38, 3, 0
+ xxspltd 39, 4, 0
+ xxspltd 41, 8, 0
+ xxspltd 44, 10, 0
+ xxspltd 47, 11, 0
+ xxspltd 48, 12, 0
+ xxlxor 0, 34, 41
+ xxlxor 1, 35, 44
+ xxswapd 37, 9
+ xxlxor 2, 39, 37
+ xxlxor 3, 40, 37
+ xxsel 34, 32, 2, 0
+ xxsel 35, 33, 3, 1
+ xxlxor 0, 36, 47
+ xxlxor 1, 45, 37
+ xxsel 36, 42, 1, 0
+ xxlxor 0, 38, 48
+ xxlxor 1, 46, 37
+ xxsel 37, 43, 1, 0
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end48:
+ .size sadd, .Lfunc_end48-.Lfunc_begin48
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i64_with_single_use # -- Begin function unsigned_sat_constant_i64_with_single_use
+ .p2align 4
+ .type unsigned_sat_constant_i64_with_single_use, at function
+unsigned_sat_constant_i64_with_single_use: # @unsigned_sat_constant_i64_with_single_use
+.Lfunc_begin49:
+ .cfi_startproc
+# %bb.0:
+ li 4, 4
+ subc 3, 3, 4
+ li 4, 0
+ addze. 4, 4
+ iseleq 3, 0, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end49:
+ .size unsigned_sat_constant_i64_with_single_use, .Lfunc_end49-.Lfunc_begin49
+ .cfi_endproc
+ # -- End function
+ .globl unsigned_sat_constant_i64_with_multiple_use # -- Begin function unsigned_sat_constant_i64_with_multiple_use
+ .p2align 4
+ .type unsigned_sat_constant_i64_with_multiple_use, at function
+unsigned_sat_constant_i64_with_multiple_use: # @unsigned_sat_constant_i64_with_multiple_use
+.Lfunc_begin50:
+ .cfi_startproc
+# %bb.0:
+ cmpldi 3, 4
+ li 5, 4
+ isellt 5, 3, 5
+ sub 3, 3, 5
+ add 4, 4, 5
+ mulld 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end50:
+ .size unsigned_sat_constant_i64_with_multiple_use, .Lfunc_end50-.Lfunc_begin50
+ .cfi_endproc
+ # -- End function
+ .section ".note.GNU-stack","", at progbits
diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
index b1d396d70ff5fd..02aeebdeb37757 100644
--- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll
+++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
@@ -8,21 +8,19 @@ define i32 @optbranch_32(i32 %Arg) {
; RV32-LABEL: optbranch_32:
; RV32: # %bb.0: # %bb
; RV32-NEXT: addi a0, a0, 1
-; RV32-NEXT: beqz a0, .LBB0_2
-; RV32-NEXT: # %bb.1: # %bb3
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB0_2: # %bb2
+; RV32-NEXT: bnez a0, .LBB0_2
+; RV32-NEXT: # %bb.1: # %bb2
; RV32-NEXT: li a0, -1
+; RV32-NEXT: .LBB0_2: # %bb3
; RV32-NEXT: ret
;
; RV64-LABEL: optbranch_32:
; RV64: # %bb.0: # %bb
; RV64-NEXT: addiw a0, a0, 1
-; RV64-NEXT: beqz a0, .LBB0_2
-; RV64-NEXT: # %bb.1: # %bb3
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB0_2: # %bb2
+; RV64-NEXT: bnez a0, .LBB0_2
+; RV64-NEXT: # %bb.1: # %bb2
; RV64-NEXT: li a0, -1
+; RV64-NEXT: .LBB0_2: # %bb3
; RV64-NEXT: ret
bb:
%i1 = icmp eq i32 %Arg, -1
@@ -43,22 +41,20 @@ define i64 @optbranch_64(i64 %Arg) {
; RV32-NEXT: seqz a2, a0
; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: or a2, a0, a1
-; RV32-NEXT: beqz a2, .LBB1_2
-; RV32-NEXT: # %bb.1: # %bb3
-; RV32-NEXT: ret
-; RV32-NEXT: .LBB1_2: # %bb2
+; RV32-NEXT: bnez a2, .LBB1_2
+; RV32-NEXT: # %bb.1: # %bb2
; RV32-NEXT: li a0, -1
; RV32-NEXT: li a1, -1
+; RV32-NEXT: .LBB1_2: # %bb3
; RV32-NEXT: ret
;
; RV64-LABEL: optbranch_64:
; RV64: # %bb.0: # %bb
; RV64-NEXT: addi a0, a0, 1
-; RV64-NEXT: beqz a0, .LBB1_2
-; RV64-NEXT: # %bb.1: # %bb3
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB1_2: # %bb2
+; RV64-NEXT: bnez a0, .LBB1_2
+; RV64-NEXT: # %bb.1: # %bb2
; RV64-NEXT: li a0, -1
+; RV64-NEXT: .LBB1_2: # %bb3
; RV64-NEXT: ret
bb:
%i1 = icmp eq i64 %Arg, -1
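
The branch-on-zero diffs above flip the branch polarity and block placement for an early-return-on--1 pattern; a sketch of the i32 case, reconstructed from the visible test body and the emitted code (block and value names are illustrative):

define i32 @optbranch_sketch(i32 %Arg) {
bb:
  %i1 = icmp eq i32 %Arg, -1          ; after adding 1 this becomes a test against zero
  br i1 %i1, label %bb2, label %bb3
bb2:
  ret i32 -1
bb3:
  %i4 = add i32 %Arg, 1
  ret i32 %i4
}
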
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index e10b360b35b569..376bbb7018700d 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -254,33 +254,60 @@ vector.ph:
}
define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
-; SSE-LABEL: test3:
-; SSE: # %bb.0: # %vector.ph
-; SSE-NEXT: movd %edi, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE-NEXT: psubusw %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2OR3-LABEL: test3:
+; SSE2OR3: # %bb.0: # %vector.ph
+; SSE2OR3-NEXT: movd %edi, %xmm1
+; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2OR3-NEXT: movdqa %xmm0, %xmm3
+; SSE2OR3-NEXT: pxor %xmm1, %xmm3
+; SSE2OR3-NEXT: psubw %xmm2, %xmm0
+; SSE2OR3-NEXT: pxor %xmm0, %xmm1
+; SSE2OR3-NEXT: pcmpgtw %xmm3, %xmm1
+; SSE2OR3-NEXT: pandn %xmm0, %xmm1
+; SSE2OR3-NEXT: movdqa %xmm1, %xmm0
+; SSE2OR3-NEXT: retq
+;
+; SSE41-LABEL: test3:
+; SSE41: # %bb.0: # %vector.ph
+; SSE41-NEXT: movd %edi, %xmm1
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubw %xmm1, %xmm2
+; SSE41-NEXT: pminuw %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpminuw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpminuw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test3:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastw %edi, %xmm1
-; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpleuw %xmm0, %xmm1, %k1
+; AVX512-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <8 x i16> undef, i16 %w, i32 0
@@ -332,7 +359,11 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: psubusb %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubb %xmm1, %xmm2
+; SSE2-NEXT: pminub %xmm2, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test6:
@@ -340,7 +371,11 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSSE3-NEXT: movd %edi, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: psubusb %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: psubb %xmm1, %xmm2
+; SSSE3-NEXT: pminub %xmm2, %xmm0
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test6:
@@ -348,7 +383,11 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE41-NEXT: movd %edi, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pshufb %xmm2, %xmm1
-; SSE41-NEXT: psubusb %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubb %xmm1, %xmm2
+; SSE41-NEXT: pminub %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test6:
@@ -356,20 +395,28 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpminub %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpminub %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test6:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastb %edi, %xmm1
-; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpleub %xmm0, %xmm1, %k1
+; AVX512-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <16 x i8> undef, i8 %w, i32 0
@@ -542,14 +589,45 @@ vector.ph:
}
define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
-; SSE-LABEL: test9:
-; SSE: # %bb.0: # %vector.ph
-; SSE-NEXT: movd %edi, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; SSE-NEXT: psubusw %xmm2, %xmm0
-; SSE-NEXT: psubusw %xmm2, %xmm1
-; SSE-NEXT: retq
+; SSE2OR3-LABEL: test9:
+; SSE2OR3: # %bb.0: # %vector.ph
+; SSE2OR3-NEXT: movd %edi, %xmm2
+; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2OR3-NEXT: movdqa %xmm1, %xmm5
+; SSE2OR3-NEXT: pxor %xmm2, %xmm5
+; SSE2OR3-NEXT: psubw %xmm4, %xmm1
+; SSE2OR3-NEXT: movdqa %xmm1, %xmm3
+; SSE2OR3-NEXT: pxor %xmm2, %xmm3
+; SSE2OR3-NEXT: pcmpgtw %xmm5, %xmm3
+; SSE2OR3-NEXT: movdqa %xmm0, %xmm5
+; SSE2OR3-NEXT: pxor %xmm2, %xmm5
+; SSE2OR3-NEXT: psubw %xmm4, %xmm0
+; SSE2OR3-NEXT: pxor %xmm0, %xmm2
+; SSE2OR3-NEXT: pcmpgtw %xmm5, %xmm2
+; SSE2OR3-NEXT: pandn %xmm0, %xmm2
+; SSE2OR3-NEXT: pandn %xmm1, %xmm3
+; SSE2OR3-NEXT: movdqa %xmm2, %xmm0
+; SSE2OR3-NEXT: movdqa %xmm3, %xmm1
+; SSE2OR3-NEXT: retq
+;
+; SSE41-LABEL: test9:
+; SSE41: # %bb.0: # %vector.ph
+; SSE41-NEXT: movd %edi, %xmm2
+; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psubw %xmm2, %xmm3
+; SSE41-NEXT: pminuw %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psubw %xmm2, %xmm4
+; SSE41-NEXT: pminuw %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqw %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm1
+; SSE41-NEXT: retq
;
; AVX1-LABEL: test9:
; AVX1: # %bb.0: # %vector.ph
@@ -557,22 +635,33 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; AVX1-NEXT: vmovd %edi, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpminuw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpminuw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test9:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
-; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpminuw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test9:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastw %edi, %ymm1
-; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsubw %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpleuw %ymm0, %ymm1, %k1
+; AVX512-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} {z}
; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <16 x i16> undef, i16 %w, i32 0
@@ -687,8 +776,16 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; SSE2-NEXT: psubusb %xmm2, %xmm0
-; SSE2-NEXT: psubusb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psubb %xmm2, %xmm3
+; SSE2-NEXT: pminub %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psubb %xmm2, %xmm4
+; SSE2-NEXT: pminub %xmm4, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test12:
@@ -696,8 +793,16 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSSE3-NEXT: movd %edi, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm2
-; SSSE3-NEXT: psubusb %xmm2, %xmm0
-; SSSE3-NEXT: psubusb %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: psubb %xmm2, %xmm3
+; SSSE3-NEXT: pminub %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: psubb %xmm2, %xmm4
+; SSSE3-NEXT: pminub %xmm4, %xmm0
+; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test12:
@@ -705,8 +810,16 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE41-NEXT: movd %edi, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pshufb %xmm3, %xmm2
-; SSE41-NEXT: psubusb %xmm2, %xmm0
-; SSE41-NEXT: psubusb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psubb %xmm2, %xmm3
+; SSE41-NEXT: pminub %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psubb %xmm2, %xmm4
+; SSE41-NEXT: pminub %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test12:
@@ -715,22 +828,33 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsubusb %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpminub %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpminub %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test12:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastb %edi, %ymm1
-; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpleub %ymm0, %ymm1, %k1
+; AVX512-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} {z}
; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <32 x i8> undef, i8 %w, i32 0
@@ -744,87 +868,122 @@ vector.ph:
define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: test13:
; SSE2: # %bb.0: # %vector.ph
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: por %xmm2, %xmm6
-; SSE2-NEXT: pslld $16, %xmm6
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: pxor %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pslld $16, %xmm5
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: packssdw %xmm6, %xmm5
-; SSE2-NEXT: psubusw %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: psubd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
+; SSE2-NEXT: packssdw %xmm2, %xmm3
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: pslld $16, %xmm4
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: packssdw %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test13:
; SSSE3: # %bb.0: # %vector.ph
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm6
-; SSSE3-NEXT: por %xmm2, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm2, %xmm6
-; SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: pand %xmm1, %xmm5
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pshufb %xmm2, %xmm5
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; SSSE3-NEXT: psubusw %xmm5, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSSE3-NEXT: por %xmm3, %xmm5
+; SSSE3-NEXT: psubd %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: por %xmm3, %xmm5
+; SSSE3-NEXT: psubd %xmm1, %xmm4
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3
+; SSSE3-NEXT: packssdw %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: pshufb %xmm1, %xmm4
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
+; SSSE3-NEXT: pandn %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test13:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
-; SSE41-NEXT: packusdw %xmm2, %xmm1
-; SSE41-NEXT: psubusw %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psubd %xmm2, %xmm5
+; SSE41-NEXT: pminud %xmm5, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm2
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: pminud %xmm2, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm4
+; SSE41-NEXT: packssdw %xmm0, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
+; SSE41-NEXT: packusdw %xmm5, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test13:
; AVX1: # %bb.0: # %vector.ph
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
-; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpminud %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4],xmm2[5],xmm4[6],xmm2[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test13:
; AVX2: # %bb.0: # %vector.ph
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpminud %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test13:
; AVX512: # %bb.0: # %vector.ph
-; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
-; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpcmpleud %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpmovdw %ymm1, %xmm0 {%k1} {z}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
vector.ph:
@@ -840,80 +999,92 @@ vector.ph:
define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSE2OR3-LABEL: test14:
; SSE2OR3: # %bb.0: # %vector.ph
-; SSE2OR3-NEXT: pxor %xmm6, %xmm6
; SSE2OR3-NEXT: movdqa %xmm0, %xmm5
-; SSE2OR3-NEXT: movdqa %xmm4, %xmm7
+; SSE2OR3-NEXT: pxor %xmm0, %xmm0
+; SSE2OR3-NEXT: movdqa %xmm5, %xmm7
+; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; SSE2OR3-NEXT: movdqa %xmm7, %xmm6
+; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
+; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2OR3-NEXT: movdqa %xmm5, %xmm8
+; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2OR3-NEXT: movdqa %xmm4, %xmm9
+; SSE2OR3-NEXT: pxor %xmm0, %xmm9
+; SSE2OR3-NEXT: psubd %xmm5, %xmm4
+; SSE2OR3-NEXT: movdqa %xmm4, %xmm5
+; SSE2OR3-NEXT: pxor %xmm0, %xmm5
+; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm5
+; SSE2OR3-NEXT: movdqa %xmm3, %xmm9
+; SSE2OR3-NEXT: pxor %xmm0, %xmm9
+; SSE2OR3-NEXT: psubd %xmm8, %xmm3
; SSE2OR3-NEXT: movdqa %xmm3, %xmm8
-; SSE2OR3-NEXT: movdqa %xmm2, %xmm9
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2OR3-NEXT: pand %xmm10, %xmm4
-; SSE2OR3-NEXT: pand %xmm10, %xmm3
+; SSE2OR3-NEXT: pxor %xmm0, %xmm8
+; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm8
+; SSE2OR3-NEXT: packssdw %xmm5, %xmm8
+; SSE2OR3-NEXT: movdqa %xmm2, %xmm5
+; SSE2OR3-NEXT: pxor %xmm0, %xmm5
+; SSE2OR3-NEXT: psubd %xmm7, %xmm2
+; SSE2OR3-NEXT: movdqa %xmm2, %xmm7
+; SSE2OR3-NEXT: pxor %xmm0, %xmm7
+; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE2OR3-NEXT: movdqa %xmm1, %xmm5
+; SSE2OR3-NEXT: pxor %xmm0, %xmm5
+; SSE2OR3-NEXT: psubd %xmm6, %xmm1
+; SSE2OR3-NEXT: pxor %xmm1, %xmm0
+; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE2OR3-NEXT: packssdw %xmm7, %xmm0
+; SSE2OR3-NEXT: packsswb %xmm8, %xmm0
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2OR3-NEXT: pand %xmm5, %xmm4
+; SSE2OR3-NEXT: pand %xmm5, %xmm3
; SSE2OR3-NEXT: packuswb %xmm4, %xmm3
-; SSE2OR3-NEXT: movdqa %xmm1, %xmm4
-; SSE2OR3-NEXT: pand %xmm10, %xmm2
-; SSE2OR3-NEXT: pand %xmm10, %xmm1
+; SSE2OR3-NEXT: pand %xmm5, %xmm2
+; SSE2OR3-NEXT: pand %xmm5, %xmm1
; SSE2OR3-NEXT: packuswb %xmm2, %xmm1
; SSE2OR3-NEXT: packuswb %xmm3, %xmm1
-; SSE2OR3-NEXT: psubb %xmm0, %xmm1
-; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
-; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; SSE2OR3-NEXT: movdqa %xmm2, %xmm0
-; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; SSE2OR3-NEXT: movdqa %xmm5, %xmm3
-; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2OR3-NEXT: pxor %xmm6, %xmm7
-; SSE2OR3-NEXT: por %xmm6, %xmm5
-; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm5
-; SSE2OR3-NEXT: pxor %xmm6, %xmm8
-; SSE2OR3-NEXT: por %xmm6, %xmm3
-; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE2OR3-NEXT: packssdw %xmm5, %xmm3
-; SSE2OR3-NEXT: pxor %xmm6, %xmm9
-; SSE2OR3-NEXT: por %xmm6, %xmm2
-; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm2
-; SSE2OR3-NEXT: pxor %xmm6, %xmm4
-; SSE2OR3-NEXT: por %xmm6, %xmm0
-; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE2OR3-NEXT: packssdw %xmm2, %xmm0
-; SSE2OR3-NEXT: packsswb %xmm3, %xmm0
; SSE2OR3-NEXT: pandn %xmm1, %xmm0
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: test14:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
-; SSE41-NEXT: pmaxud %xmm4, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm8
-; SSE41-NEXT: pmaxud %xmm3, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
-; SSE41-NEXT: packssdw %xmm8, %xmm7
-; SSE41-NEXT: pmaxud %xmm1, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm6
-; SSE41-NEXT: pmaxud %xmm2, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
-; SSE41-NEXT: packssdw %xmm5, %xmm6
-; SSE41-NEXT: packsswb %xmm7, %xmm6
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = [255,255,255,255]
-; SSE41-NEXT: pand %xmm5, %xmm4
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: packusdw %xmm4, %xmm3
-; SSE41-NEXT: pand %xmm5, %xmm2
-; SSE41-NEXT: pand %xmm1, %xmm5
-; SSE41-NEXT: packusdw %xmm2, %xmm5
-; SSE41-NEXT: packuswb %xmm3, %xmm5
-; SSE41-NEXT: psubb %xmm0, %xmm5
-; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: movdqa %xmm4, %xmm6
+; SSE41-NEXT: psubd %xmm0, %xmm6
+; SSE41-NEXT: pminud %xmm6, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: psubd %xmm9, %xmm5
+; SSE41-NEXT: pminud %xmm5, %xmm3
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE41-NEXT: packssdw %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psubd %xmm8, %xmm0
+; SSE41-NEXT: pminud %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psubd %xmm7, %xmm4
+; SSE41-NEXT: pminud %xmm4, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE41-NEXT: packssdw %xmm2, %xmm1
+; SSE41-NEXT: packsswb %xmm3, %xmm1
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm6
+; SSE41-NEXT: pand %xmm2, %xmm5
+; SSE41-NEXT: packusdw %xmm6, %xmm5
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm4
+; SSE41-NEXT: packusdw %xmm4, %xmm0
+; SSE41-NEXT: packuswb %xmm5, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test14:
@@ -923,31 +1094,34 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vpmaxud %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpmaxud %xmm5, %xmm2, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm5
-; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpmaxud %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT: vpmaxud %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpminud %xmm6, %xmm0, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm0, %xmm6
+; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm5
+; AVX1-NEXT: vpminud %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpackssdw %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm4
+; AVX1-NEXT: vpminud %xmm1, %xmm4, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpackusdw %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
+; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -955,35 +1129,38 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpmaxud %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpacksswb %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpminud %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test14:
; AVX512: # %bb.0: # %vector.ph
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpcmpnltud %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpsubd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vpcmpleud %zmm1, %zmm0, %k1
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
vector.ph:
@@ -1221,10 +1398,26 @@ define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
-; SSE2-NEXT: psubusb %xmm4, %xmm0
-; SSE2-NEXT: psubusb %xmm4, %xmm1
-; SSE2-NEXT: psubusb %xmm4, %xmm2
-; SSE2-NEXT: psubusb %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psubb %xmm4, %xmm5
+; SSE2-NEXT: pminub %xmm5, %xmm3
+; SSE2-NEXT: pcmpeqb %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psubb %xmm4, %xmm6
+; SSE2-NEXT: pminub %xmm6, %xmm2
+; SSE2-NEXT: pcmpeqb %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psubb %xmm4, %xmm7
+; SSE2-NEXT: pminub %xmm7, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm7, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: psubb %xmm4, %xmm8
+; SSE2-NEXT: pminub %xmm8, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm8, %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test17:
@@ -1232,10 +1425,26 @@ define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
; SSSE3-NEXT: movd %edi, %xmm4
; SSSE3-NEXT: pxor %xmm5, %xmm5
; SSSE3-NEXT: pshufb %xmm5, %xmm4
-; SSSE3-NEXT: psubusb %xmm4, %xmm0
-; SSSE3-NEXT: psubusb %xmm4, %xmm1
-; SSSE3-NEXT: psubusb %xmm4, %xmm2
-; SSSE3-NEXT: psubusb %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: psubb %xmm4, %xmm5
+; SSSE3-NEXT: pminub %xmm5, %xmm3
+; SSSE3-NEXT: pcmpeqb %xmm5, %xmm3
+; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: psubb %xmm4, %xmm6
+; SSSE3-NEXT: pminub %xmm6, %xmm2
+; SSSE3-NEXT: pcmpeqb %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, %xmm7
+; SSSE3-NEXT: psubb %xmm4, %xmm7
+; SSSE3-NEXT: pminub %xmm7, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm7, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm8
+; SSSE3-NEXT: psubb %xmm4, %xmm8
+; SSSE3-NEXT: pminub %xmm8, %xmm0
+; SSSE3-NEXT: pcmpeqb %xmm8, %xmm0
+; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: pand %xmm7, %xmm1
+; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test17:
@@ -1243,10 +1452,26 @@ define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
; SSE41-NEXT: movd %edi, %xmm4
; SSE41-NEXT: pxor %xmm5, %xmm5
; SSE41-NEXT: pshufb %xmm5, %xmm4
-; SSE41-NEXT: psubusb %xmm4, %xmm0
-; SSE41-NEXT: psubusb %xmm4, %xmm1
-; SSE41-NEXT: psubusb %xmm4, %xmm2
-; SSE41-NEXT: psubusb %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: psubb %xmm4, %xmm5
+; SSE41-NEXT: pminub %xmm5, %xmm3
+; SSE41-NEXT: pcmpeqb %xmm5, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: psubb %xmm4, %xmm6
+; SSE41-NEXT: pminub %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqb %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm7
+; SSE41-NEXT: psubb %xmm4, %xmm7
+; SSE41-NEXT: pminub %xmm7, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm7, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm8
+; SSE41-NEXT: psubb %xmm4, %xmm8
+; SSE41-NEXT: pminub %xmm8, %xmm0
+; SSE41-NEXT: pcmpeqb %xmm8, %xmm0
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm1
+; SSE41-NEXT: pand %xmm6, %xmm2
+; SSE41-NEXT: pand %xmm5, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test17:
@@ -1254,28 +1479,48 @@ define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
; AVX1-NEXT: vmovd %edi, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpminub %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm5
+; AVX1-NEXT: vpminub %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm6
+; AVX1-NEXT: vpminub %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpminub %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm3
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpminub %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsubb %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpminub %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test17:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastb %edi, %zmm1
-; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpleub %zmm0, %zmm1, %k1
+; AVX512-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} {z}
; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <64 x i8> undef, i8 %w, i32 0
@@ -1287,44 +1532,119 @@ vector.ph:
}
define <32 x i16> @test18(<32 x i16> %x, i16 zeroext %w) nounwind {
-; SSE-LABEL: test18:
-; SSE: # %bb.0: # %vector.ph
-; SSE-NEXT: movd %edi, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
-; SSE-NEXT: psubusw %xmm4, %xmm0
-; SSE-NEXT: psubusw %xmm4, %xmm1
-; SSE-NEXT: psubusw %xmm4, %xmm2
-; SSE-NEXT: psubusw %xmm4, %xmm3
-; SSE-NEXT: retq
+; SSE2OR3-LABEL: test18:
+; SSE2OR3: # %bb.0: # %vector.ph
+; SSE2OR3-NEXT: movdqa %xmm0, %xmm5
+; SSE2OR3-NEXT: movd %edi, %xmm0
+; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1]
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm0 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2OR3-NEXT: movdqa %xmm3, %xmm6
+; SSE2OR3-NEXT: pxor %xmm0, %xmm6
+; SSE2OR3-NEXT: psubw %xmm8, %xmm3
+; SSE2OR3-NEXT: movdqa %xmm3, %xmm4
+; SSE2OR3-NEXT: pxor %xmm0, %xmm4
+; SSE2OR3-NEXT: pcmpgtw %xmm6, %xmm4
+; SSE2OR3-NEXT: movdqa %xmm2, %xmm7
+; SSE2OR3-NEXT: pxor %xmm0, %xmm7
+; SSE2OR3-NEXT: psubw %xmm8, %xmm2
+; SSE2OR3-NEXT: movdqa %xmm2, %xmm6
+; SSE2OR3-NEXT: pxor %xmm0, %xmm6
+; SSE2OR3-NEXT: pcmpgtw %xmm7, %xmm6
+; SSE2OR3-NEXT: movdqa %xmm1, %xmm9
+; SSE2OR3-NEXT: pxor %xmm0, %xmm9
+; SSE2OR3-NEXT: psubw %xmm8, %xmm1
+; SSE2OR3-NEXT: movdqa %xmm1, %xmm7
+; SSE2OR3-NEXT: pxor %xmm0, %xmm7
+; SSE2OR3-NEXT: pcmpgtw %xmm9, %xmm7
+; SSE2OR3-NEXT: movdqa %xmm5, %xmm9
+; SSE2OR3-NEXT: pxor %xmm0, %xmm9
+; SSE2OR3-NEXT: psubw %xmm8, %xmm5
+; SSE2OR3-NEXT: pxor %xmm5, %xmm0
+; SSE2OR3-NEXT: pcmpgtw %xmm9, %xmm0
+; SSE2OR3-NEXT: pandn %xmm5, %xmm0
+; SSE2OR3-NEXT: pandn %xmm1, %xmm7
+; SSE2OR3-NEXT: pandn %xmm2, %xmm6
+; SSE2OR3-NEXT: pandn %xmm3, %xmm4
+; SSE2OR3-NEXT: movdqa %xmm7, %xmm1
+; SSE2OR3-NEXT: movdqa %xmm6, %xmm2
+; SSE2OR3-NEXT: movdqa %xmm4, %xmm3
+; SSE2OR3-NEXT: retq
+;
+; SSE41-LABEL: test18:
+; SSE41: # %bb.0: # %vector.ph
+; SSE41-NEXT: movd %edi, %xmm4
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: psubw %xmm4, %xmm5
+; SSE41-NEXT: pminuw %xmm5, %xmm3
+; SSE41-NEXT: pcmpeqw %xmm5, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: psubw %xmm4, %xmm6
+; SSE41-NEXT: pminuw %xmm6, %xmm2
+; SSE41-NEXT: pcmpeqw %xmm6, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm7
+; SSE41-NEXT: psubw %xmm4, %xmm7
+; SSE41-NEXT: pminuw %xmm7, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm7, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm8
+; SSE41-NEXT: psubw %xmm4, %xmm8
+; SSE41-NEXT: pminuw %xmm8, %xmm0
+; SSE41-NEXT: pcmpeqw %xmm8, %xmm0
+; SSE41-NEXT: pand %xmm8, %xmm0
+; SSE41-NEXT: pand %xmm7, %xmm1
+; SSE41-NEXT: pand %xmm6, %xmm2
+; SSE41-NEXT: pand %xmm5, %xmm3
+; SSE41-NEXT: retq
;
; AVX1-LABEL: test18:
; AVX1: # %bb.0: # %vector.ph
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovd %edi, %xmm3
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
-; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubusw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpminuw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vpminuw %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm6
+; AVX1-NEXT: vpminuw %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpminuw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm2
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm2
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test18:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm2
; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubw %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpminuw %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsubw %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpminuw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test18:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastw %edi, %zmm1
-; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpsubw %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpleuw %zmm0, %zmm1, %k1
+; AVX512-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z}
; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <32 x i16> undef, i16 %w, i32 0
diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index 1b307b30d8c0d1..4e31b48ec5cece 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -2065,11 +2065,10 @@ define i64 @PR51612(i64 %x, i64 %y) {
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %eax
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %edx
-; ATHLON-NEXT: addl $1, %ecx
-; ATHLON-NEXT: adcl $0, %edx
-; ATHLON-NEXT: incl %eax
-; ATHLON-NEXT: orl %ecx, %edx
-; ATHLON-NEXT: cmovnel %ecx, %eax
+; ATHLON-NEXT: incl %edx
+; ATHLON-NEXT: addl $1, %eax
+; ATHLON-NEXT: adcl $0, %ecx
+; ATHLON-NEXT: cmovbl %edx, %eax
; ATHLON-NEXT: andl 10, %eax
; ATHLON-NEXT: xorl %edx, %edx
; ATHLON-NEXT: retl
@@ -2078,8 +2077,7 @@ define i64 @PR51612(i64 %x, i64 %y) {
; MCU: # %bb.0:
; MCU-NEXT: addl $1, %eax
; MCU-NEXT: adcl $0, %edx
-; MCU-NEXT: orl %eax, %edx
-; MCU-NEXT: jne .LBB45_2
+; MCU-NEXT: jae .LBB45_2
; MCU-NEXT: # %bb.1:
; MCU-NEXT: movl {{[0-9]+}}(%esp), %eax
; MCU-NEXT: incl %eax
diff --git a/test_direct_uaddo.ll b/test_direct_uaddo.ll
new file mode 100644
index 00000000000000..a923d212bbf904
--- /dev/null
+++ b/test_direct_uaddo.ll
@@ -0,0 +1,9 @@
+define i32 @test_direct_uaddo(i32 %x, i32 %y) {
+ %result = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
+ %ovf = extractvalue {i32, i1} %result, 1
+ %val = extractvalue {i32, i1} %result, 0
+ %sel = select i1 %ovf, i32 -1, i32 %val
+ ret i32 %sel
+}
+
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
diff --git a/test_sat_pattern.ll b/test_sat_pattern.ll
new file mode 100644
index 00000000000000..150c8081a77ac7
--- /dev/null
+++ b/test_sat_pattern.ll
@@ -0,0 +1,6 @@
+define <8 x i16> @test_sat_pattern(<8 x i16> %x, <8 x i16> %y) {
+ %a = add <8 x i16> %x, %y
+ %c = icmp ugt <8 x i16> %x, %a
+ %r = select <8 x i1> %c, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %a
+ ret <8 x i16> %r
+}
diff --git a/test_sat_pattern.s b/test_sat_pattern.s
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/test_scalar_sat.ll b/test_scalar_sat.ll
new file mode 100644
index 00000000000000..6ef9729e66a754
--- /dev/null
+++ b/test_scalar_sat.ll
@@ -0,0 +1,6 @@
+define i8 @test_scalar_sat(i8 %x) {
+ %a = add i8 %x, 42
+ %c = icmp ugt i8 %x, %a
+ %r = select i1 %c, i8 -1, i8 %a
+ ret i8 %r
+}
diff --git a/test_uaddo_conversion.ll b/test_uaddo_conversion.ll
new file mode 100644
index 00000000000000..ca433863997b79
--- /dev/null
+++ b/test_uaddo_conversion.ll
@@ -0,0 +1,9 @@
+define i32 @test_uaddo_to_uaddsat(i32 %x, i32 %y) {
+ %result = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
+ %val = extractvalue {i32, i1} %result, 0
+ %ovf = extractvalue {i32, i1} %result, 1
+ %sel = select i1 %ovf, i32 -1, i32 %val
+ ret i32 %sel
+}
+
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
diff --git a/test_uaddo_only.ll b/test_uaddo_only.ll
new file mode 100644
index 00000000000000..4f7056148fa994
--- /dev/null
+++ b/test_uaddo_only.ll
@@ -0,0 +1,6 @@
+define i32 @test_uaddo_only(i32 %x, i32 %y) {
+ %add = add i32 %x, %y
+ %cmp = icmp ugt i32 %x, %add
+ %sel = select i1 %cmp, i32 -1, i32 %add
+ ret i32 %sel
+}
diff --git a/test_uaddo_only.s b/test_uaddo_only.s
new file mode 100644
index 00000000000000..e04ea329bd8e97
--- /dev/null
+++ b/test_uaddo_only.s
@@ -0,0 +1,22 @@
+ .abiversion 2
+ .file "test_uaddo_only.ll"
+ .text
+ .globl test_uaddo_only # -- Begin function test_uaddo_only
+ .p2align 4
+ .type test_uaddo_only,@function
+test_uaddo_only: # @test_uaddo_only
+.Lfunc_begin0:
+ .cfi_startproc
+# %bb.0:
+ add 4, 3, 4
+ cmplw 4, 3
+ li 3, -1
+ isellt 3, 3, 4
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end0:
+ .size test_uaddo_only, .Lfunc_end0-.Lfunc_begin0
+ .cfi_endproc
+ # -- End function
+ .section ".note.GNU-stack","", at progbits
diff --git a/test_uaddsat.ll b/test_uaddsat.ll
new file mode 100644
index 00000000000000..0c5423504fb482
--- /dev/null
+++ b/test_uaddsat.ll
@@ -0,0 +1,9 @@
+; Test file to verify uaddo -> uaddsat conversion
+define i32 @test_uaddsat_pattern(i32 %x, i32 %y) {
+ %add = add i32 %x, %y
+ %cmp = icmp ugt i32 %x, %add
+ %sel = select i1 %cmp, i32 -1, i32 %add
+ ret i32 %sel
+}
+
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
diff --git a/test_usubo.ll b/test_usubo.ll
new file mode 100644
index 00000000000000..e588f43f3cec9e
--- /dev/null
+++ b/test_usubo.ll
@@ -0,0 +1,15 @@
+; Test file to verify usubo -> usubsat conversion
+define i32 @test_usubo_to_usubsat(i32 %x, i32 %y) {
+ %result = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %x, i32 %y)
+ %val = extractvalue {i32, i1} %result, 0
+ ret i32 %val
+}
+
+define i32 @test_uaddo_to_uaddsat(i32 %x, i32 %y) {
+ %result = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
+ %val = extractvalue {i32, i1} %result, 0
+ ret i32 %val
+}
+
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32)
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
diff --git a/test_vector_uaddo.ll b/test_vector_uaddo.ll
new file mode 100644
index 00000000000000..8105ed0041f54e
--- /dev/null
+++ b/test_vector_uaddo.ll
@@ -0,0 +1,9 @@
+define <8 x i16> @test_vector_uaddo(<8 x i16> %x, <8 x i16> %y) {
+ %result = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> %x, <8 x i16> %y)
+ %ovf = extractvalue { <8 x i16>, <8 x i1> } %result, 1
+ %val = extractvalue { <8 x i16>, <8 x i1> } %result, 0
+ %sel = select <8 x i1> %ovf, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %val
+ ret <8 x i16> %sel
+}
+
+declare { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
diff --git a/test_vector_uaddo.s b/test_vector_uaddo.s
new file mode 100644
index 00000000000000..5834fc58aa5621
--- /dev/null
+++ b/test_vector_uaddo.s
@@ -0,0 +1,21 @@
+ .abiversion 2
+ .file "test_vector_uaddo.ll"
+ .text
+ .globl test_vector_uaddo # -- Begin function test_vector_uaddo
+ .p2align 4
+ .type test_vector_uaddo,@function
+test_vector_uaddo: # @test_vector_uaddo
+.Lfunc_begin0:
+ .cfi_startproc
+# %bb.0:
+ vadduhm 3, 2, 3
+ vcmpgtuh 2, 2, 3
+ xxlor 34, 34, 35
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end0:
+ .size test_vector_uaddo, .Lfunc_end0-.Lfunc_begin0
+ .cfi_endproc
+ # -- End function
+ .section ".note.GNU-stack","", at progbits
diff --git a/trace_uaddsat.ll b/trace_uaddsat.ll
new file mode 100644
index 00000000000000..8fccd2816d67fd
--- /dev/null
+++ b/trace_uaddsat.ll
@@ -0,0 +1,6 @@
+define i32 @test_uaddsat(i32 %x, i32 %y) {
+ %add = add i32 %x, %y
+ %cmp = icmp ugt i32 %x, %add
+ %sel = select i1 %cmp, i32 -1, i32 %add
+ ret i32 %sel
+}
>From 3a7fdcbfc121fa588b2905efbf132c44b0eb80e6 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Wed, 24 Sep 2025 12:34:21 -0400
Subject: [PATCH 04/17] j
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 25 ++++++++++++
llvm/test/CodeGen/PowerPC/sat-add.ll | 38 +++++++------------
2 files changed, 39 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a6ba6e518899f8..db2fc895cf09ff 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13347,6 +13347,31 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
DAG.getNode(ISD::AND, DL, N0.getValueType(), N1.getOperand(1), N0));
}
+ // vselect uaddo(x, y).overflow, -1, uaddo(x, y) -> uaddsat(x, y)
+ // This converts the pattern created by CodeGenPrepare back to uaddsat
+ // Handle the case where overflow might be sign-extended
+ if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ // Look through sign_extend_inreg to find the actual overflow flag
+ (void)N0.getOperand(0);
+ if ((N1.getOpcode() == ISD::UADDO && N1.getResNo() == 0 &&
+ ISD::isConstantSplatVectorAllOnes(N2.getNode())) ||
+ (N2.getOpcode() == ISD::UADDO && N2.getResNo() == 0 &&
+ ISD::isConstantSplatVectorAllOnes(N1.getNode()))) {
+ LLVM_DEBUG(dbgs() << "Converting uaddo to uaddsat\n");
+ return DAG.getNode(ISD::UADDSAT, DL, VT,
+ N1.getOpcode() == ISD::UADDO ? N1.getOperand(0) : N2.getOperand(0),
+ N1.getOpcode() == ISD::UADDO ? N1.getOperand(1) : N2.getOperand(1));
+ }
+ } else if ((N1.getOpcode() == ISD::UADDO && N1.getResNo() == 0 &&
+ ISD::isConstantSplatVectorAllOnes(N2.getNode())) ||
+ (N2.getOpcode() == ISD::UADDO && N2.getResNo() == 0 &&
+ ISD::isConstantSplatVectorAllOnes(N1.getNode()))) {
+ LLVM_DEBUG(dbgs() << "Converting uaddo to uaddsat\n");
+ return DAG.getNode(ISD::UADDSAT, DL, VT,
+ N1.getOpcode() == ISD::UADDO ? N1.getOperand(0) : N2.getOperand(0),
+ N1.getOpcode() == ISD::UADDO ? N1.getOperand(1) : N2.getOperand(1));
+ }
+
// Canonicalize integer abs.
// vselect (setg[te] X, 0), X, -X ->
// vselect (setgt X, -1), X, -X ->
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll
index 012f03f0b884c4..f04ef6d329bce6 100644
--- a/llvm/test/CodeGen/PowerPC/sat-add.ll
+++ b/llvm/test/CodeGen/PowerPC/sat-add.ll
@@ -400,9 +400,7 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_sum(<16 x i8> %x) {
 ; CHECK-NEXT: addis 3, 2, .LCPI25_0@toc@ha
 ; CHECK-NEXT: addi 3, 3, .LCPI25_0@toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
-; CHECK-NEXT: vaddubm 3, 2, 3
-; CHECK-NEXT: vcmpgtub 2, 2, 3
-; CHECK-NEXT: xxlor 34, 34, 35
+; CHECK-NEXT: vaddubs 2, 2, 3
; CHECK-NEXT: blr
%a = add <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
%c = icmp ugt <16 x i8> %x, %a
@@ -448,9 +446,7 @@ define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_sum(<8 x i16> %x) {
 ; CHECK-NEXT: addis 3, 2, .LCPI28_0@toc@ha
 ; CHECK-NEXT: addi 3, 3, .LCPI28_0@toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
-; CHECK-NEXT: vadduhm 3, 2, 3
-; CHECK-NEXT: vcmpgtuh 2, 2, 3
-; CHECK-NEXT: xxlor 34, 34, 35
+; CHECK-NEXT: vadduhs 2, 2, 3
; CHECK-NEXT: blr
%a = add <8 x i16> %x, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
%c = icmp ugt <8 x i16> %x, %a
@@ -496,9 +492,7 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_sum(<4 x i32> %x) {
 ; CHECK-NEXT: addis 3, 2, .LCPI31_0@toc@ha
 ; CHECK-NEXT: addi 3, 3, .LCPI31_0@toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
-; CHECK-NEXT: vadduwm 3, 2, 3
-; CHECK-NEXT: vcmpgtuw 2, 2, 3
-; CHECK-NEXT: xxlor 34, 34, 35
+; CHECK-NEXT: vadduws 2, 2, 3
; CHECK-NEXT: blr
%a = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
%c = icmp ugt <4 x i32> %x, %a
@@ -544,9 +538,11 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
 ; CHECK-NEXT: addis 3, 2, .LCPI34_0@toc@ha
 ; CHECK-NEXT: addi 3, 3, .LCPI34_0@toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
-; CHECK-NEXT: vaddudm 3, 2, 3
-; CHECK-NEXT: vcmpgtud 2, 2, 3
-; CHECK-NEXT: xxlor 34, 34, 35
+; CHECK-NEXT: addis 3, 2, .LCPI34_1@toc@ha
+; CHECK-NEXT: addi 3, 3, .LCPI34_1@toc@l
+; CHECK-NEXT: lxvd2x 36, 0, 3
+; CHECK-NEXT: vminud 2, 2, 4
+; CHECK-NEXT: vaddudm 2, 2, 3
; CHECK-NEXT: blr
%a = add <2 x i64> %x, <i64 42, i64 42>
%c = icmp ugt <2 x i64> %x, %a
@@ -590,9 +586,7 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_min(<16 x i8> %x, <16 x i8>
define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_sum(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: vaddubm 3, 2, 3
-; CHECK-NEXT: vcmpgtub 2, 2, 3
-; CHECK-NEXT: xxlor 34, 34, 35
+; CHECK-NEXT: vaddubs 2, 2, 3
; CHECK-NEXT: blr
%a = add <16 x i8> %x, %y
%c = icmp ugt <16 x i8> %x, %a
@@ -632,9 +626,7 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_min(<8 x i16> %x, <8 x i16>
define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: vadduhm 3, 2, 3
-; CHECK-NEXT: vcmpgtuh 2, 2, 3
-; CHECK-NEXT: xxlor 34, 34, 35
+; CHECK-NEXT: vadduhs 2, 2, 3
; CHECK-NEXT: blr
%a = add <8 x i16> %x, %y
%c = icmp ugt <8 x i16> %x, %a
@@ -674,9 +666,7 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32>
define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: vadduwm 3, 2, 3
-; CHECK-NEXT: vcmpgtuw 2, 2, 3
-; CHECK-NEXT: xxlor 34, 34, 35
+; CHECK-NEXT: vadduws 2, 2, 3
; CHECK-NEXT: blr
%a = add <4 x i32> %x, %y
%c = icmp ugt <4 x i32> %x, %a
@@ -716,9 +706,9 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64>
define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: vaddudm 3, 2, 3
-; CHECK-NEXT: vcmpgtud 2, 2, 3
-; CHECK-NEXT: xxlor 34, 34, 35
+; CHECK-NEXT: xxlnor 36, 35, 35
+; CHECK-NEXT: vminud 2, 2, 4
+; CHECK-NEXT: vaddudm 2, 2, 3
; CHECK-NEXT: blr
%a = add <2 x i64> %x, %y
%c = icmp ugt <2 x i64> %x, %a
>From 5804715511b74d76ad5aa15227d33a06465c3c5f Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Wed, 24 Sep 2025 12:41:41 -0400
Subject: [PATCH 05/17] Revert "f"
This reverts commit 1342b3027025367dfcaf61d13ee1ce1f6ecd9739.
---
llvm/test/CodeGen/PowerPC/sat-add.ll | 38 +-
llvm/test/CodeGen/PowerPC/sat-add.s | 1260 ---------------------
llvm/test/CodeGen/RISCV/branch-on-zero.ll | 28 +-
llvm/test/CodeGen/X86/psubus.ll | 814 ++++---------
llvm/test/CodeGen/X86/select.ll | 12 +-
test_direct_uaddo.ll | 9 -
test_sat_pattern.ll | 6 -
test_sat_pattern.s | 0
test_scalar_sat.ll | 6 -
test_uaddo_conversion.ll | 9 -
test_uaddo_only.ll | 6 -
test_uaddo_only.s | 22 -
test_uaddsat.ll | 9 -
test_usubo.ll | 15 -
test_vector_uaddo.ll | 9 -
test_vector_uaddo.s | 21 -
trace_uaddsat.ll | 6 -
17 files changed, 290 insertions(+), 1980 deletions(-)
delete mode 100644 llvm/test/CodeGen/PowerPC/sat-add.s
delete mode 100644 test_direct_uaddo.ll
delete mode 100644 test_sat_pattern.ll
delete mode 100644 test_sat_pattern.s
delete mode 100644 test_scalar_sat.ll
delete mode 100644 test_uaddo_conversion.ll
delete mode 100644 test_uaddo_only.ll
delete mode 100644 test_uaddo_only.s
delete mode 100644 test_uaddsat.ll
delete mode 100644 test_usubo.ll
delete mode 100644 test_vector_uaddo.ll
delete mode 100644 test_vector_uaddo.s
delete mode 100644 trace_uaddsat.ll
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll
index f04ef6d329bce6..771c2ca0a866c3 100644
--- a/llvm/test/CodeGen/PowerPC/sat-add.ll
+++ b/llvm/test/CodeGen/PowerPC/sat-add.ll
@@ -24,11 +24,12 @@ define i8 @unsigned_sat_constant_i8_using_min(i8 %x) {
define i8 @unsigned_sat_constant_i8_using_cmp_sum(i8 %x) {
; CHECK-LABEL: unsigned_sat_constant_i8_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 3, 3, 24
+; CHECK-NEXT: clrlwi 4, 3, 24
; CHECK-NEXT: addi 3, 3, 42
-; CHECK-NEXT: andi. 4, 3, 256
+; CHECK-NEXT: clrlwi 5, 3, 24
+; CHECK-NEXT: cmplw 4, 5
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iseleq 3, 3, 4
+; CHECK-NEXT: iselgt 3, 4, 3
; CHECK-NEXT: blr
%a = add i8 %x, 42
%c = icmp ugt i8 %x, %a
@@ -69,11 +70,12 @@ define i16 @unsigned_sat_constant_i16_using_min(i16 %x) {
define i16 @unsigned_sat_constant_i16_using_cmp_sum(i16 %x) {
; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 3, 3, 16
+; CHECK-NEXT: clrlwi 4, 3, 16
; CHECK-NEXT: addi 3, 3, 42
-; CHECK-NEXT: andis. 4, 3, 1
+; CHECK-NEXT: clrlwi 5, 3, 16
+; CHECK-NEXT: cmplw 4, 5
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iseleq 3, 3, 4
+; CHECK-NEXT: iselgt 3, 4, 3
; CHECK-NEXT: blr
%a = add i16 %x, 42
%c = icmp ugt i16 %x, %a
@@ -114,9 +116,9 @@ define i32 @unsigned_sat_constant_i32_using_cmp_sum(i32 %x) {
; CHECK-LABEL: unsigned_sat_constant_i32_using_cmp_sum:
; CHECK: # %bb.0:
; CHECK-NEXT: addi 4, 3, 42
-; CHECK-NEXT: cmplw 4, 3
+; CHECK-NEXT: cmplw 3, 4
; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: isellt 3, 3, 4
+; CHECK-NEXT: iselgt 3, 3, 4
; CHECK-NEXT: blr
%a = add i32 %x, 42
%c = icmp ugt i32 %x, %a
@@ -203,12 +205,12 @@ define i8 @unsigned_sat_variable_i8_using_min(i8 %x, i8 %y) {
define i8 @unsigned_sat_variable_i8_using_cmp_sum(i8 %x, i8 %y) {
; CHECK-LABEL: unsigned_sat_variable_i8_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 4, 4, 24
-; CHECK-NEXT: clrlwi 3, 3, 24
+; CHECK-NEXT: clrlwi 5, 3, 24
; CHECK-NEXT: add 3, 3, 4
-; CHECK-NEXT: andi. 4, 3, 256
+; CHECK-NEXT: clrlwi 4, 3, 24
+; CHECK-NEXT: cmplw 5, 4
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iseleq 3, 3, 4
+; CHECK-NEXT: iselgt 3, 4, 3
; CHECK-NEXT: blr
%a = add i8 %x, %y
%c = icmp ugt i8 %x, %a
@@ -254,12 +256,12 @@ define i16 @unsigned_sat_variable_i16_using_min(i16 %x, i16 %y) {
define i16 @unsigned_sat_variable_i16_using_cmp_sum(i16 %x, i16 %y) {
; CHECK-LABEL: unsigned_sat_variable_i16_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 4, 4, 16
-; CHECK-NEXT: clrlwi 3, 3, 16
+; CHECK-NEXT: clrlwi 5, 3, 16
; CHECK-NEXT: add 3, 3, 4
-; CHECK-NEXT: andis. 4, 3, 1
+; CHECK-NEXT: clrlwi 4, 3, 16
+; CHECK-NEXT: cmplw 5, 4
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iseleq 3, 3, 4
+; CHECK-NEXT: iselgt 3, 4, 3
; CHECK-NEXT: blr
%a = add i16 %x, %y
%c = icmp ugt i16 %x, %a
@@ -304,9 +306,9 @@ define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) {
; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_sum:
; CHECK: # %bb.0:
; CHECK-NEXT: add 4, 3, 4
-; CHECK-NEXT: cmplw 4, 3
+; CHECK-NEXT: cmplw 3, 4
; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: isellt 3, 3, 4
+; CHECK-NEXT: iselgt 3, 3, 4
; CHECK-NEXT: blr
%a = add i32 %x, %y
%c = icmp ugt i32 %x, %a
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.s b/llvm/test/CodeGen/PowerPC/sat-add.s
deleted file mode 100644
index ca085fc0f6997f..00000000000000
--- a/llvm/test/CodeGen/PowerPC/sat-add.s
+++ /dev/null
@@ -1,1260 +0,0 @@
- .abiversion 2
- .file "sat-add.ll"
- .text
- .globl unsigned_sat_constant_i8_using_min # -- Begin function unsigned_sat_constant_i8_using_min
- .p2align 4
- .type unsigned_sat_constant_i8_using_min,@function
-unsigned_sat_constant_i8_using_min: # @unsigned_sat_constant_i8_using_min
-.Lfunc_begin0:
- .cfi_startproc
-# %bb.0:
- clrlwi 4, 3, 24
- cmplwi 4, 213
- li 4, -43
- isellt 3, 3, 4
- addi 3, 3, 42
- blr
- .long 0
- .quad 0
-.Lfunc_end0:
- .size unsigned_sat_constant_i8_using_min, .Lfunc_end0-.Lfunc_begin0
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i8_using_cmp_sum # -- Begin function unsigned_sat_constant_i8_using_cmp_sum
- .p2align 4
- .type unsigned_sat_constant_i8_using_cmp_sum,@function
-unsigned_sat_constant_i8_using_cmp_sum: # @unsigned_sat_constant_i8_using_cmp_sum
-.Lfunc_begin1:
- .cfi_startproc
-# %bb.0:
- clrlwi 3, 3, 24
- addi 3, 3, 42
- andi. 4, 3, 256
- li 4, -1
- iseleq 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end1:
- .size unsigned_sat_constant_i8_using_cmp_sum, .Lfunc_end1-.Lfunc_begin1
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i8_using_cmp_notval # -- Begin function unsigned_sat_constant_i8_using_cmp_notval
- .p2align 4
- .type unsigned_sat_constant_i8_using_cmp_notval,@function
-unsigned_sat_constant_i8_using_cmp_notval: # @unsigned_sat_constant_i8_using_cmp_notval
-.Lfunc_begin2:
- .cfi_startproc
-# %bb.0:
- clrlwi 4, 3, 24
- addi 3, 3, 42
- cmplwi 4, 213
- li 4, -1
- iselgt 3, 4, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end2:
- .size unsigned_sat_constant_i8_using_cmp_notval, .Lfunc_end2-.Lfunc_begin2
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i16_using_min # -- Begin function unsigned_sat_constant_i16_using_min
- .p2align 4
- .type unsigned_sat_constant_i16_using_min,@function
-unsigned_sat_constant_i16_using_min: # @unsigned_sat_constant_i16_using_min
-.Lfunc_begin3:
- .cfi_startproc
-# %bb.0:
- clrlwi 4, 3, 16
- cmplwi 4, 65493
- li 4, -43
- isellt 3, 3, 4
- addi 3, 3, 42
- blr
- .long 0
- .quad 0
-.Lfunc_end3:
- .size unsigned_sat_constant_i16_using_min, .Lfunc_end3-.Lfunc_begin3
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i16_using_cmp_sum # -- Begin function unsigned_sat_constant_i16_using_cmp_sum
- .p2align 4
- .type unsigned_sat_constant_i16_using_cmp_sum,@function
-unsigned_sat_constant_i16_using_cmp_sum: # @unsigned_sat_constant_i16_using_cmp_sum
-.Lfunc_begin4:
- .cfi_startproc
-# %bb.0:
- clrlwi 3, 3, 16
- addi 3, 3, 42
- andis. 4, 3, 1
- li 4, -1
- iseleq 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end4:
- .size unsigned_sat_constant_i16_using_cmp_sum, .Lfunc_end4-.Lfunc_begin4
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i16_using_cmp_notval # -- Begin function unsigned_sat_constant_i16_using_cmp_notval
- .p2align 4
- .type unsigned_sat_constant_i16_using_cmp_notval,@function
-unsigned_sat_constant_i16_using_cmp_notval: # @unsigned_sat_constant_i16_using_cmp_notval
-.Lfunc_begin5:
- .cfi_startproc
-# %bb.0:
- clrlwi 4, 3, 16
- addi 3, 3, 42
- cmplwi 4, 65493
- li 4, -1
- iselgt 3, 4, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end5:
- .size unsigned_sat_constant_i16_using_cmp_notval, .Lfunc_end5-.Lfunc_begin5
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i32_using_min # -- Begin function unsigned_sat_constant_i32_using_min
- .p2align 4
- .type unsigned_sat_constant_i32_using_min,@function
-unsigned_sat_constant_i32_using_min: # @unsigned_sat_constant_i32_using_min
-.Lfunc_begin6:
- .cfi_startproc
-# %bb.0:
- li 4, -43
- cmplw 3, 4
- isellt 3, 3, 4
- addi 3, 3, 42
- blr
- .long 0
- .quad 0
-.Lfunc_end6:
- .size unsigned_sat_constant_i32_using_min, .Lfunc_end6-.Lfunc_begin6
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i32_using_cmp_sum # -- Begin function unsigned_sat_constant_i32_using_cmp_sum
- .p2align 4
- .type unsigned_sat_constant_i32_using_cmp_sum,@function
-unsigned_sat_constant_i32_using_cmp_sum: # @unsigned_sat_constant_i32_using_cmp_sum
-.Lfunc_begin7:
- .cfi_startproc
-# %bb.0:
- addi 4, 3, 42
- cmplw 4, 3
- li 3, -1
- isellt 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end7:
- .size unsigned_sat_constant_i32_using_cmp_sum, .Lfunc_end7-.Lfunc_begin7
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i32_using_cmp_notval # -- Begin function unsigned_sat_constant_i32_using_cmp_notval
- .p2align 4
- .type unsigned_sat_constant_i32_using_cmp_notval,@function
-unsigned_sat_constant_i32_using_cmp_notval: # @unsigned_sat_constant_i32_using_cmp_notval
-.Lfunc_begin8:
- .cfi_startproc
-# %bb.0:
- li 5, -43
- addi 4, 3, 42
- cmplw 3, 5
- li 3, -1
- iselgt 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end8:
- .size unsigned_sat_constant_i32_using_cmp_notval, .Lfunc_end8-.Lfunc_begin8
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i64_using_min # -- Begin function unsigned_sat_constant_i64_using_min
- .p2align 4
- .type unsigned_sat_constant_i64_using_min,@function
-unsigned_sat_constant_i64_using_min: # @unsigned_sat_constant_i64_using_min
-.Lfunc_begin9:
- .cfi_startproc
-# %bb.0:
- li 4, -43
- cmpld 3, 4
- isellt 3, 3, 4
- addi 3, 3, 42
- blr
- .long 0
- .quad 0
-.Lfunc_end9:
- .size unsigned_sat_constant_i64_using_min, .Lfunc_end9-.Lfunc_begin9
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i64_using_cmp_sum # -- Begin function unsigned_sat_constant_i64_using_cmp_sum
- .p2align 4
- .type unsigned_sat_constant_i64_using_cmp_sum,@function
-unsigned_sat_constant_i64_using_cmp_sum: # @unsigned_sat_constant_i64_using_cmp_sum
-.Lfunc_begin10:
- .cfi_startproc
-# %bb.0:
- li 4, 0
- addic 3, 3, 42
- addze. 4, 4
- li 4, -1
- iseleq 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end10:
- .size unsigned_sat_constant_i64_using_cmp_sum, .Lfunc_end10-.Lfunc_begin10
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i64_using_cmp_notval # -- Begin function unsigned_sat_constant_i64_using_cmp_notval
- .p2align 4
- .type unsigned_sat_constant_i64_using_cmp_notval,@function
-unsigned_sat_constant_i64_using_cmp_notval: # @unsigned_sat_constant_i64_using_cmp_notval
-.Lfunc_begin11:
- .cfi_startproc
-# %bb.0:
- li 4, 0
- addic 3, 3, 42
- addze. 4, 4
- li 4, -1
- iseleq 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end11:
- .size unsigned_sat_constant_i64_using_cmp_notval, .Lfunc_end11-.Lfunc_begin11
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_i8_using_min # -- Begin function unsigned_sat_variable_i8_using_min
- .p2align 4
- .type unsigned_sat_variable_i8_using_min,@function
-unsigned_sat_variable_i8_using_min: # @unsigned_sat_variable_i8_using_min
-.Lfunc_begin12:
- .cfi_startproc
-# %bb.0:
- not 6, 4
- clrlwi 5, 3, 24
- clrlwi 7, 6, 24
- cmplw 5, 7
- isellt 3, 3, 6
- add 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end12:
- .size unsigned_sat_variable_i8_using_min, .Lfunc_end12-.Lfunc_begin12
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_i8_using_cmp_sum # -- Begin function unsigned_sat_variable_i8_using_cmp_sum
- .p2align 4
- .type unsigned_sat_variable_i8_using_cmp_sum,@function
-unsigned_sat_variable_i8_using_cmp_sum: # @unsigned_sat_variable_i8_using_cmp_sum
-.Lfunc_begin13:
- .cfi_startproc
-# %bb.0:
- clrlwi 4, 4, 24
- clrlwi 3, 3, 24
- add 3, 3, 4
- andi. 4, 3, 256
- li 4, -1
- iseleq 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end13:
- .size unsigned_sat_variable_i8_using_cmp_sum, .Lfunc_end13-.Lfunc_begin13
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_i8_using_cmp_notval # -- Begin function unsigned_sat_variable_i8_using_cmp_notval
- .p2align 4
- .type unsigned_sat_variable_i8_using_cmp_notval,@function
-unsigned_sat_variable_i8_using_cmp_notval: # @unsigned_sat_variable_i8_using_cmp_notval
-.Lfunc_begin14:
- .cfi_startproc
-# %bb.0:
- not 6, 4
- clrlwi 5, 3, 24
- add 3, 3, 4
- li 4, -1
- clrlwi 6, 6, 24
- cmplw 5, 6
- iselgt 3, 4, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end14:
- .size unsigned_sat_variable_i8_using_cmp_notval, .Lfunc_end14-.Lfunc_begin14
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_i16_using_min # -- Begin function unsigned_sat_variable_i16_using_min
- .p2align 4
- .type unsigned_sat_variable_i16_using_min,@function
-unsigned_sat_variable_i16_using_min: # @unsigned_sat_variable_i16_using_min
-.Lfunc_begin15:
- .cfi_startproc
-# %bb.0:
- not 6, 4
- clrlwi 5, 3, 16
- clrlwi 7, 6, 16
- cmplw 5, 7
- isellt 3, 3, 6
- add 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end15:
- .size unsigned_sat_variable_i16_using_min, .Lfunc_end15-.Lfunc_begin15
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_i16_using_cmp_sum # -- Begin function unsigned_sat_variable_i16_using_cmp_sum
- .p2align 4
- .type unsigned_sat_variable_i16_using_cmp_sum,@function
-unsigned_sat_variable_i16_using_cmp_sum: # @unsigned_sat_variable_i16_using_cmp_sum
-.Lfunc_begin16:
- .cfi_startproc
-# %bb.0:
- clrlwi 4, 4, 16
- clrlwi 3, 3, 16
- add 3, 3, 4
- andis. 4, 3, 1
- li 4, -1
- iseleq 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end16:
- .size unsigned_sat_variable_i16_using_cmp_sum, .Lfunc_end16-.Lfunc_begin16
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_i16_using_cmp_notval # -- Begin function unsigned_sat_variable_i16_using_cmp_notval
- .p2align 4
- .type unsigned_sat_variable_i16_using_cmp_notval,@function
-unsigned_sat_variable_i16_using_cmp_notval: # @unsigned_sat_variable_i16_using_cmp_notval
-.Lfunc_begin17:
- .cfi_startproc
-# %bb.0:
- not 6, 4
- clrlwi 5, 3, 16
- add 3, 3, 4
- li 4, -1
- clrlwi 6, 6, 16
- cmplw 5, 6
- iselgt 3, 4, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end17:
- .size unsigned_sat_variable_i16_using_cmp_notval, .Lfunc_end17-.Lfunc_begin17
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_i32_using_min # -- Begin function unsigned_sat_variable_i32_using_min
- .p2align 4
- .type unsigned_sat_variable_i32_using_min,@function
-unsigned_sat_variable_i32_using_min: # @unsigned_sat_variable_i32_using_min
-.Lfunc_begin18:
- .cfi_startproc
-# %bb.0:
- not 5, 4
- cmplw 3, 5
- isellt 3, 3, 5
- add 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end18:
- .size unsigned_sat_variable_i32_using_min, .Lfunc_end18-.Lfunc_begin18
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_i32_using_cmp_sum # -- Begin function unsigned_sat_variable_i32_using_cmp_sum
- .p2align 4
- .type unsigned_sat_variable_i32_using_cmp_sum,@function
-unsigned_sat_variable_i32_using_cmp_sum: # @unsigned_sat_variable_i32_using_cmp_sum
-.Lfunc_begin19:
- .cfi_startproc
-# %bb.0:
- add 4, 3, 4
- cmplw 4, 3
- li 3, -1
- isellt 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end19:
- .size unsigned_sat_variable_i32_using_cmp_sum, .Lfunc_end19-.Lfunc_begin19
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_i32_using_cmp_notval # -- Begin function unsigned_sat_variable_i32_using_cmp_notval
- .p2align 4
- .type unsigned_sat_variable_i32_using_cmp_notval,@function
-unsigned_sat_variable_i32_using_cmp_notval: # @unsigned_sat_variable_i32_using_cmp_notval
-.Lfunc_begin20:
- .cfi_startproc
-# %bb.0:
- not 5, 4
- add 4, 3, 4
- cmplw 3, 5
- li 3, -1
- iselgt 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end20:
- .size unsigned_sat_variable_i32_using_cmp_notval, .Lfunc_end20-.Lfunc_begin20
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_i64_using_min # -- Begin function unsigned_sat_variable_i64_using_min
- .p2align 4
- .type unsigned_sat_variable_i64_using_min,@function
-unsigned_sat_variable_i64_using_min: # @unsigned_sat_variable_i64_using_min
-.Lfunc_begin21:
- .cfi_startproc
-# %bb.0:
- not 5, 4
- cmpld 3, 5
- isellt 3, 3, 5
- add 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end21:
- .size unsigned_sat_variable_i64_using_min, .Lfunc_end21-.Lfunc_begin21
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_i64_using_cmp_sum # -- Begin function unsigned_sat_variable_i64_using_cmp_sum
- .p2align 4
- .type unsigned_sat_variable_i64_using_cmp_sum,@function
-unsigned_sat_variable_i64_using_cmp_sum: # @unsigned_sat_variable_i64_using_cmp_sum
-.Lfunc_begin22:
- .cfi_startproc
-# %bb.0:
- addc 3, 3, 4
- li 4, 0
- addze. 4, 4
- li 4, -1
- iseleq 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end22:
- .size unsigned_sat_variable_i64_using_cmp_sum, .Lfunc_end22-.Lfunc_begin22
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_i64_using_cmp_notval # -- Begin function unsigned_sat_variable_i64_using_cmp_notval
- .p2align 4
- .type unsigned_sat_variable_i64_using_cmp_notval,@function
-unsigned_sat_variable_i64_using_cmp_notval: # @unsigned_sat_variable_i64_using_cmp_notval
-.Lfunc_begin23:
- .cfi_startproc
-# %bb.0:
- not 5, 4
- add 4, 3, 4
- cmpld 3, 5
- li 3, -1
- iselgt 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end23:
- .size unsigned_sat_variable_i64_using_cmp_notval, .Lfunc_end23-.Lfunc_begin23
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v16i8_using_min
-.LCPI24_0:
- .space 16,213
-.LCPI24_1:
- .space 16,42
- .text
- .globl unsigned_sat_constant_v16i8_using_min
- .p2align 4
- .type unsigned_sat_constant_v16i8_using_min,@function
-unsigned_sat_constant_v16i8_using_min: # @unsigned_sat_constant_v16i8_using_min
-.Lfunc_begin24:
- .cfi_startproc
-.Lfunc_gep24:
- addis 2, 12, .TOC.-.Lfunc_gep24@ha
- addi 2, 2, .TOC.-.Lfunc_gep24@l
-.Lfunc_lep24:
- .localentry unsigned_sat_constant_v16i8_using_min, .Lfunc_lep24-.Lfunc_gep24
-# %bb.0:
- addis 3, 2, .LCPI24_0@toc@ha
- addi 3, 3, .LCPI24_0@toc@l
- lxvd2x 35, 0, 3
- addis 3, 2, .LCPI24_1@toc@ha
- addi 3, 3, .LCPI24_1@toc@l
- vminub 2, 2, 3
- lxvd2x 35, 0, 3
- vaddubm 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end24:
- .size unsigned_sat_constant_v16i8_using_min, .Lfunc_end24-.Lfunc_begin24
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v16i8_using_cmp_sum
-.LCPI25_0:
- .space 16,42
- .text
- .globl unsigned_sat_constant_v16i8_using_cmp_sum
- .p2align 4
- .type unsigned_sat_constant_v16i8_using_cmp_sum,@function
-unsigned_sat_constant_v16i8_using_cmp_sum: # @unsigned_sat_constant_v16i8_using_cmp_sum
-.Lfunc_begin25:
- .cfi_startproc
-.Lfunc_gep25:
- addis 2, 12, .TOC.-.Lfunc_gep25@ha
- addi 2, 2, .TOC.-.Lfunc_gep25@l
-.Lfunc_lep25:
- .localentry unsigned_sat_constant_v16i8_using_cmp_sum, .Lfunc_lep25-.Lfunc_gep25
-# %bb.0:
- addis 3, 2, .LCPI25_0@toc@ha
- addi 3, 3, .LCPI25_0@toc@l
- lxvd2x 35, 0, 3
- vaddubs 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end25:
- .size unsigned_sat_constant_v16i8_using_cmp_sum, .Lfunc_end25-.Lfunc_begin25
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v16i8_using_cmp_notval
-.LCPI26_0:
- .space 16,42
- .text
- .globl unsigned_sat_constant_v16i8_using_cmp_notval
- .p2align 4
- .type unsigned_sat_constant_v16i8_using_cmp_notval,@function
-unsigned_sat_constant_v16i8_using_cmp_notval: # @unsigned_sat_constant_v16i8_using_cmp_notval
-.Lfunc_begin26:
- .cfi_startproc
-.Lfunc_gep26:
- addis 2, 12, .TOC.-.Lfunc_gep26@ha
- addi 2, 2, .TOC.-.Lfunc_gep26@l
-.Lfunc_lep26:
- .localentry unsigned_sat_constant_v16i8_using_cmp_notval, .Lfunc_lep26-.Lfunc_gep26
-# %bb.0:
- addis 3, 2, .LCPI26_0@toc@ha
- addi 3, 3, .LCPI26_0@toc@l
- lxvd2x 35, 0, 3
- vaddubs 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end26:
- .size unsigned_sat_constant_v16i8_using_cmp_notval, .Lfunc_end26-.Lfunc_begin26
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v8i16_using_min
-.LCPI27_0:
- .short 65493 # 0xffd5
- .short 65493 # 0xffd5
- .short 65493 # 0xffd5
- .short 65493 # 0xffd5
- .short 65493 # 0xffd5
- .short 65493 # 0xffd5
- .short 65493 # 0xffd5
- .short 65493 # 0xffd5
-.LCPI27_1:
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .text
- .globl unsigned_sat_constant_v8i16_using_min
- .p2align 4
- .type unsigned_sat_constant_v8i16_using_min,@function
-unsigned_sat_constant_v8i16_using_min: # @unsigned_sat_constant_v8i16_using_min
-.Lfunc_begin27:
- .cfi_startproc
-.Lfunc_gep27:
- addis 2, 12, .TOC.-.Lfunc_gep27 at ha
- addi 2, 2, .TOC.-.Lfunc_gep27 at l
-.Lfunc_lep27:
- .localentry unsigned_sat_constant_v8i16_using_min, .Lfunc_lep27-.Lfunc_gep27
-# %bb.0:
- addis 3, 2, .LCPI27_0@toc@ha
- addi 3, 3, .LCPI27_0@toc@l
- lxvd2x 35, 0, 3
- addis 3, 2, .LCPI27_1@toc@ha
- addi 3, 3, .LCPI27_1@toc@l
- vminuh 2, 2, 3
- lxvd2x 35, 0, 3
- vadduhm 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end27:
- .size unsigned_sat_constant_v8i16_using_min, .Lfunc_end27-.Lfunc_begin27
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v8i16_using_cmp_sum
-.LCPI28_0:
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .text
- .globl unsigned_sat_constant_v8i16_using_cmp_sum
- .p2align 4
- .type unsigned_sat_constant_v8i16_using_cmp_sum,@function
-unsigned_sat_constant_v8i16_using_cmp_sum: # @unsigned_sat_constant_v8i16_using_cmp_sum
-.Lfunc_begin28:
- .cfi_startproc
-.Lfunc_gep28:
- addis 2, 12, .TOC.-.Lfunc_gep28 at ha
- addi 2, 2, .TOC.-.Lfunc_gep28 at l
-.Lfunc_lep28:
- .localentry unsigned_sat_constant_v8i16_using_cmp_sum, .Lfunc_lep28-.Lfunc_gep28
-# %bb.0:
- addis 3, 2, .LCPI28_0@toc@ha
- addi 3, 3, .LCPI28_0@toc@l
- lxvd2x 35, 0, 3
- vadduhs 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end28:
- .size unsigned_sat_constant_v8i16_using_cmp_sum, .Lfunc_end28-.Lfunc_begin28
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v8i16_using_cmp_notval
-.LCPI29_0:
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .short 42 # 0x2a
- .text
- .globl unsigned_sat_constant_v8i16_using_cmp_notval
- .p2align 4
- .type unsigned_sat_constant_v8i16_using_cmp_notval,@function
-unsigned_sat_constant_v8i16_using_cmp_notval: # @unsigned_sat_constant_v8i16_using_cmp_notval
-.Lfunc_begin29:
- .cfi_startproc
-.Lfunc_gep29:
- addis 2, 12, .TOC.-.Lfunc_gep29 at ha
- addi 2, 2, .TOC.-.Lfunc_gep29 at l
-.Lfunc_lep29:
- .localentry unsigned_sat_constant_v8i16_using_cmp_notval, .Lfunc_lep29-.Lfunc_gep29
-# %bb.0:
- addis 3, 2, .LCPI29_0@toc@ha
- addi 3, 3, .LCPI29_0@toc@l
- lxvd2x 35, 0, 3
- vadduhs 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end29:
- .size unsigned_sat_constant_v8i16_using_cmp_notval, .Lfunc_end29-.Lfunc_begin29
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v4i32_using_min
-.LCPI30_0:
- .long 4294967253 # 0xffffffd5
- .long 4294967253 # 0xffffffd5
- .long 4294967253 # 0xffffffd5
- .long 4294967253 # 0xffffffd5
-.LCPI30_1:
- .long 42 # 0x2a
- .long 42 # 0x2a
- .long 42 # 0x2a
- .long 42 # 0x2a
- .text
- .globl unsigned_sat_constant_v4i32_using_min
- .p2align 4
- .type unsigned_sat_constant_v4i32_using_min,@function
-unsigned_sat_constant_v4i32_using_min: # @unsigned_sat_constant_v4i32_using_min
-.Lfunc_begin30:
- .cfi_startproc
-.Lfunc_gep30:
- addis 2, 12, .TOC.-.Lfunc_gep30 at ha
- addi 2, 2, .TOC.-.Lfunc_gep30 at l
-.Lfunc_lep30:
- .localentry unsigned_sat_constant_v4i32_using_min, .Lfunc_lep30-.Lfunc_gep30
-# %bb.0:
- addis 3, 2, .LCPI30_0@toc@ha
- addi 3, 3, .LCPI30_0@toc@l
- lxvd2x 35, 0, 3
- addis 3, 2, .LCPI30_1@toc@ha
- addi 3, 3, .LCPI30_1@toc@l
- vminuw 2, 2, 3
- lxvd2x 35, 0, 3
- vadduwm 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end30:
- .size unsigned_sat_constant_v4i32_using_min, .Lfunc_end30-.Lfunc_begin30
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v4i32_using_cmp_sum
-.LCPI31_0:
- .long 42 # 0x2a
- .long 42 # 0x2a
- .long 42 # 0x2a
- .long 42 # 0x2a
- .text
- .globl unsigned_sat_constant_v4i32_using_cmp_sum
- .p2align 4
- .type unsigned_sat_constant_v4i32_using_cmp_sum,@function
-unsigned_sat_constant_v4i32_using_cmp_sum: # @unsigned_sat_constant_v4i32_using_cmp_sum
-.Lfunc_begin31:
- .cfi_startproc
-.Lfunc_gep31:
- addis 2, 12, .TOC.-.Lfunc_gep31 at ha
- addi 2, 2, .TOC.-.Lfunc_gep31 at l
-.Lfunc_lep31:
- .localentry unsigned_sat_constant_v4i32_using_cmp_sum, .Lfunc_lep31-.Lfunc_gep31
-# %bb.0:
- addis 3, 2, .LCPI31_0@toc@ha
- addi 3, 3, .LCPI31_0@toc@l
- lxvd2x 35, 0, 3
- vadduws 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end31:
- .size unsigned_sat_constant_v4i32_using_cmp_sum, .Lfunc_end31-.Lfunc_begin31
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v4i32_using_cmp_notval
-.LCPI32_0:
- .long 42 # 0x2a
- .long 42 # 0x2a
- .long 42 # 0x2a
- .long 42 # 0x2a
- .text
- .globl unsigned_sat_constant_v4i32_using_cmp_notval
- .p2align 4
- .type unsigned_sat_constant_v4i32_using_cmp_notval,@function
-unsigned_sat_constant_v4i32_using_cmp_notval: # @unsigned_sat_constant_v4i32_using_cmp_notval
-.Lfunc_begin32:
- .cfi_startproc
-.Lfunc_gep32:
- addis 2, 12, .TOC.-.Lfunc_gep32 at ha
- addi 2, 2, .TOC.-.Lfunc_gep32 at l
-.Lfunc_lep32:
- .localentry unsigned_sat_constant_v4i32_using_cmp_notval, .Lfunc_lep32-.Lfunc_gep32
-# %bb.0:
- addis 3, 2, .LCPI32_0@toc@ha
- addi 3, 3, .LCPI32_0@toc@l
- lxvd2x 35, 0, 3
- vadduws 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end32:
- .size unsigned_sat_constant_v4i32_using_cmp_notval, .Lfunc_end32-.Lfunc_begin32
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v2i64_using_min
-.LCPI33_0:
- .quad -43 # 0xffffffffffffffd5
- .quad -43 # 0xffffffffffffffd5
-.LCPI33_1:
- .quad 42 # 0x2a
- .quad 42 # 0x2a
- .text
- .globl unsigned_sat_constant_v2i64_using_min
- .p2align 4
- .type unsigned_sat_constant_v2i64_using_min,@function
-unsigned_sat_constant_v2i64_using_min: # @unsigned_sat_constant_v2i64_using_min
-.Lfunc_begin33:
- .cfi_startproc
-.Lfunc_gep33:
- addis 2, 12, .TOC.-.Lfunc_gep33 at ha
- addi 2, 2, .TOC.-.Lfunc_gep33 at l
-.Lfunc_lep33:
- .localentry unsigned_sat_constant_v2i64_using_min, .Lfunc_lep33-.Lfunc_gep33
-# %bb.0:
- addis 3, 2, .LCPI33_0@toc@ha
- addi 3, 3, .LCPI33_0@toc@l
- lxvd2x 35, 0, 3
- addis 3, 2, .LCPI33_1@toc@ha
- addi 3, 3, .LCPI33_1@toc@l
- vminud 2, 2, 3
- lxvd2x 35, 0, 3
- vaddudm 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end33:
- .size unsigned_sat_constant_v2i64_using_min, .Lfunc_end33-.Lfunc_begin33
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v2i64_using_cmp_sum
-.LCPI34_0:
- .quad 42 # 0x2a
- .quad 42 # 0x2a
-.LCPI34_1:
- .quad -43 # 0xffffffffffffffd5
- .quad -43 # 0xffffffffffffffd5
- .text
- .globl unsigned_sat_constant_v2i64_using_cmp_sum
- .p2align 4
- .type unsigned_sat_constant_v2i64_using_cmp_sum,@function
-unsigned_sat_constant_v2i64_using_cmp_sum: # @unsigned_sat_constant_v2i64_using_cmp_sum
-.Lfunc_begin34:
- .cfi_startproc
-.Lfunc_gep34:
- addis 2, 12, .TOC.-.Lfunc_gep34 at ha
- addi 2, 2, .TOC.-.Lfunc_gep34 at l
-.Lfunc_lep34:
- .localentry unsigned_sat_constant_v2i64_using_cmp_sum, .Lfunc_lep34-.Lfunc_gep34
-# %bb.0:
- addis 3, 2, .LCPI34_0@toc@ha
- addi 3, 3, .LCPI34_0@toc@l
- lxvd2x 35, 0, 3
- addis 3, 2, .LCPI34_1@toc@ha
- addi 3, 3, .LCPI34_1@toc@l
- lxvd2x 36, 0, 3
- vminud 2, 2, 4
- vaddudm 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end34:
- .size unsigned_sat_constant_v2i64_using_cmp_sum, .Lfunc_end34-.Lfunc_begin34
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function unsigned_sat_constant_v2i64_using_cmp_notval
-.LCPI35_0:
- .quad 42 # 0x2a
- .quad 42 # 0x2a
-.LCPI35_1:
- .quad -43 # 0xffffffffffffffd5
- .quad -43 # 0xffffffffffffffd5
- .text
- .globl unsigned_sat_constant_v2i64_using_cmp_notval
- .p2align 4
- .type unsigned_sat_constant_v2i64_using_cmp_notval,@function
-unsigned_sat_constant_v2i64_using_cmp_notval: # @unsigned_sat_constant_v2i64_using_cmp_notval
-.Lfunc_begin35:
- .cfi_startproc
-.Lfunc_gep35:
- addis 2, 12, .TOC.-.Lfunc_gep35 at ha
- addi 2, 2, .TOC.-.Lfunc_gep35 at l
-.Lfunc_lep35:
- .localentry unsigned_sat_constant_v2i64_using_cmp_notval, .Lfunc_lep35-.Lfunc_gep35
-# %bb.0:
- addis 3, 2, .LCPI35_0@toc@ha
- addi 3, 3, .LCPI35_0@toc@l
- lxvd2x 35, 0, 3
- addis 3, 2, .LCPI35_1@toc@ha
- addi 3, 3, .LCPI35_1@toc@l
- lxvd2x 36, 0, 3
- vaddudm 3, 2, 3
- vcmpgtud 2, 2, 4
- xxlor 34, 34, 35
- blr
- .long 0
- .quad 0
-.Lfunc_end35:
- .size unsigned_sat_constant_v2i64_using_cmp_notval, .Lfunc_end35-.Lfunc_begin35
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_v16i8_using_min # -- Begin function unsigned_sat_variable_v16i8_using_min
- .p2align 4
- .type unsigned_sat_variable_v16i8_using_min,@function
-unsigned_sat_variable_v16i8_using_min: # @unsigned_sat_variable_v16i8_using_min
-.Lfunc_begin36:
- .cfi_startproc
-# %bb.0:
- xxlnor 36, 35, 35
- vminub 2, 2, 4
- vaddubm 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end36:
- .size unsigned_sat_variable_v16i8_using_min, .Lfunc_end36-.Lfunc_begin36
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_v16i8_using_cmp_sum # -- Begin function unsigned_sat_variable_v16i8_using_cmp_sum
- .p2align 4
- .type unsigned_sat_variable_v16i8_using_cmp_sum,@function
-unsigned_sat_variable_v16i8_using_cmp_sum: # @unsigned_sat_variable_v16i8_using_cmp_sum
-.Lfunc_begin37:
- .cfi_startproc
-# %bb.0:
- vaddubs 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end37:
- .size unsigned_sat_variable_v16i8_using_cmp_sum, .Lfunc_end37-.Lfunc_begin37
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_v16i8_using_cmp_notval # -- Begin function unsigned_sat_variable_v16i8_using_cmp_notval
- .p2align 4
- .type unsigned_sat_variable_v16i8_using_cmp_notval,@function
-unsigned_sat_variable_v16i8_using_cmp_notval: # @unsigned_sat_variable_v16i8_using_cmp_notval
-.Lfunc_begin38:
- .cfi_startproc
-# %bb.0:
- xxlnor 36, 35, 35
- vaddubm 3, 2, 3
- vcmpgtub 2, 2, 4
- xxlor 34, 34, 35
- blr
- .long 0
- .quad 0
-.Lfunc_end38:
- .size unsigned_sat_variable_v16i8_using_cmp_notval, .Lfunc_end38-.Lfunc_begin38
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_v8i16_using_min # -- Begin function unsigned_sat_variable_v8i16_using_min
- .p2align 4
- .type unsigned_sat_variable_v8i16_using_min,@function
-unsigned_sat_variable_v8i16_using_min: # @unsigned_sat_variable_v8i16_using_min
-.Lfunc_begin39:
- .cfi_startproc
-# %bb.0:
- xxlnor 36, 35, 35
- vminuh 2, 2, 4
- vadduhm 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end39:
- .size unsigned_sat_variable_v8i16_using_min, .Lfunc_end39-.Lfunc_begin39
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_v8i16_using_cmp_sum # -- Begin function unsigned_sat_variable_v8i16_using_cmp_sum
- .p2align 4
- .type unsigned_sat_variable_v8i16_using_cmp_sum,@function
-unsigned_sat_variable_v8i16_using_cmp_sum: # @unsigned_sat_variable_v8i16_using_cmp_sum
-.Lfunc_begin40:
- .cfi_startproc
-# %bb.0:
- vadduhs 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end40:
- .size unsigned_sat_variable_v8i16_using_cmp_sum, .Lfunc_end40-.Lfunc_begin40
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_v8i16_using_cmp_notval # -- Begin function unsigned_sat_variable_v8i16_using_cmp_notval
- .p2align 4
- .type unsigned_sat_variable_v8i16_using_cmp_notval,@function
-unsigned_sat_variable_v8i16_using_cmp_notval: # @unsigned_sat_variable_v8i16_using_cmp_notval
-.Lfunc_begin41:
- .cfi_startproc
-# %bb.0:
- xxlnor 36, 35, 35
- vadduhm 3, 2, 3
- vcmpgtuh 2, 2, 4
- xxlor 34, 34, 35
- blr
- .long 0
- .quad 0
-.Lfunc_end41:
- .size unsigned_sat_variable_v8i16_using_cmp_notval, .Lfunc_end41-.Lfunc_begin41
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_v4i32_using_min # -- Begin function unsigned_sat_variable_v4i32_using_min
- .p2align 4
- .type unsigned_sat_variable_v4i32_using_min,@function
-unsigned_sat_variable_v4i32_using_min: # @unsigned_sat_variable_v4i32_using_min
-.Lfunc_begin42:
- .cfi_startproc
-# %bb.0:
- xxlnor 36, 35, 35
- vminuw 2, 2, 4
- vadduwm 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end42:
- .size unsigned_sat_variable_v4i32_using_min, .Lfunc_end42-.Lfunc_begin42
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_v4i32_using_cmp_sum # -- Begin function unsigned_sat_variable_v4i32_using_cmp_sum
- .p2align 4
- .type unsigned_sat_variable_v4i32_using_cmp_sum,@function
-unsigned_sat_variable_v4i32_using_cmp_sum: # @unsigned_sat_variable_v4i32_using_cmp_sum
-.Lfunc_begin43:
- .cfi_startproc
-# %bb.0:
- vadduws 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end43:
- .size unsigned_sat_variable_v4i32_using_cmp_sum, .Lfunc_end43-.Lfunc_begin43
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_v4i32_using_cmp_notval # -- Begin function unsigned_sat_variable_v4i32_using_cmp_notval
- .p2align 4
- .type unsigned_sat_variable_v4i32_using_cmp_notval,@function
-unsigned_sat_variable_v4i32_using_cmp_notval: # @unsigned_sat_variable_v4i32_using_cmp_notval
-.Lfunc_begin44:
- .cfi_startproc
-# %bb.0:
- xxlnor 36, 35, 35
- vadduwm 3, 2, 3
- vcmpgtuw 2, 2, 4
- xxlor 34, 34, 35
- blr
- .long 0
- .quad 0
-.Lfunc_end44:
- .size unsigned_sat_variable_v4i32_using_cmp_notval, .Lfunc_end44-.Lfunc_begin44
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_v2i64_using_min # -- Begin function unsigned_sat_variable_v2i64_using_min
- .p2align 4
- .type unsigned_sat_variable_v2i64_using_min,@function
-unsigned_sat_variable_v2i64_using_min: # @unsigned_sat_variable_v2i64_using_min
-.Lfunc_begin45:
- .cfi_startproc
-# %bb.0:
- xxlnor 36, 35, 35
- vminud 2, 2, 4
- vaddudm 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end45:
- .size unsigned_sat_variable_v2i64_using_min, .Lfunc_end45-.Lfunc_begin45
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_v2i64_using_cmp_sum # -- Begin function unsigned_sat_variable_v2i64_using_cmp_sum
- .p2align 4
- .type unsigned_sat_variable_v2i64_using_cmp_sum,@function
-unsigned_sat_variable_v2i64_using_cmp_sum: # @unsigned_sat_variable_v2i64_using_cmp_sum
-.Lfunc_begin46:
- .cfi_startproc
-# %bb.0:
- xxlnor 36, 35, 35
- vminud 2, 2, 4
- vaddudm 2, 2, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end46:
- .size unsigned_sat_variable_v2i64_using_cmp_sum, .Lfunc_end46-.Lfunc_begin46
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_variable_v2i64_using_cmp_notval # -- Begin function unsigned_sat_variable_v2i64_using_cmp_notval
- .p2align 4
- .type unsigned_sat_variable_v2i64_using_cmp_notval,@function
-unsigned_sat_variable_v2i64_using_cmp_notval: # @unsigned_sat_variable_v2i64_using_cmp_notval
-.Lfunc_begin47:
- .cfi_startproc
-# %bb.0:
- xxlnor 36, 35, 35
- vaddudm 3, 2, 3
- vcmpgtud 2, 2, 4
- xxlor 34, 34, 35
- blr
- .long 0
- .quad 0
-.Lfunc_end47:
- .size unsigned_sat_variable_v2i64_using_cmp_notval, .Lfunc_end47-.Lfunc_begin47
- .cfi_endproc
- # -- End function
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4, 0x0 # -- Begin function sadd
-.LCPI48_0:
- .quad 0 # 0x0
- .quad -9223372036854775808 # 0x8000000000000000
- .text
- .globl sadd
- .p2align 4
- .type sadd,@function
-sadd: # @sadd
-.Lfunc_begin48:
- .cfi_startproc
-.Lfunc_gep48:
- addis 2, 12, .TOC.-.Lfunc_gep48@ha
- addi 2, 2, .TOC.-.Lfunc_gep48@l
-.Lfunc_lep48:
- .localentry sadd, .Lfunc_lep48-.Lfunc_gep48
-# %bb.0:
- vadduqm 0, 2, 6
- vadduqm 10, 4, 8
- mfocrf 12, 32
- stw 12, 8(1)
- xxswapd 0, 34
- xxswapd 4, 36
- vadduqm 1, 3, 7
- vadduqm 11, 5, 9
- mffprd 3, 0
- mffprd 6, 4
- lwz 12, 8(1)
- xxswapd 2, 35
- xxswapd 5, 37
- mffprd 4, 2
- xxswapd 1, 32
- xxswapd 6, 42
- mffprd 5, 1
- cmpld 6, 5, 3
- mffprd 7, 6
- xxswapd 3, 33
- xxswapd 7, 43
- mffprd 3, 3
- cmpld 5, 7, 6
- mffprd 6, 5
- mffprd 7, 7
- mfvsrd 5, 36
- cmpld 3, 4
- mfvsrd 3, 34
- cmpld 1, 7, 6
- mfvsrd 7, 32
- mfvsrd 4, 35
- mfvsrd 6, 37
- cmpld 7, 7, 3
- cmpd 2, 7, 3
- mfvsrd 3, 33
- crandc 21, 8, 30
- crand 22, 30, 24
- cmpld 6, 3, 4
- cmpd 7, 3, 4
- mfvsrd 4, 42
- sradi 3, 3, 63
- mtocrf 32, 12
- crnor 21, 22, 21
- crandc 23, 28, 26
- crand 24, 26, 0
- cmpld 4, 5
- cmpd 7, 4, 5
- mfvsrd 5, 43
- crnor 22, 24, 23
- mtfprd 5, 3
- sradi 4, 4, 63
- mtfprd 6, 4
- crandc 25, 28, 2
- crand 20, 2, 20
- cmpld 5, 6
- cmpd 7, 5, 6
- mfvsrd 6, 38
- sradi 5, 5, 63
- crnor 20, 20, 25
- mtfprd 7, 5
- sradi 6, 6, 63
- crandc 26, 28, 2
- crand 27, 2, 4
- crnor 23, 27, 26
- mtfprd 0, 6
- mfvsrd 6, 39
- sradi 6, 6, 63
- mtfprd 1, 6
- mfvsrd 6, 40
- sradi 6, 6, 63
- mtfprd 2, 6
- mfvsrd 6, 41
- sradi 6, 6, 63
- mtfprd 3, 6
- sradi 6, 7, 63
- mtfprd 4, 6
- li 6, -1
- isel 3, 0, 6, 21
- isel 4, 0, 6, 22
- isel 5, 0, 6, 20
- isel 6, 0, 6, 23
- mtfprd 8, 3
- addis 3, 2, .LCPI48_0@toc@ha
- mtfprd 10, 4
- mtfprd 11, 5
- mtfprd 12, 6
- addi 3, 3, .LCPI48_0@toc@l
- lxvd2x 9, 0, 3
- xxspltd 45, 6, 0
- xxspltd 46, 7, 0
- xxspltd 34, 0, 0
- xxspltd 40, 5, 0
- xxspltd 35, 1, 0
- xxspltd 36, 2, 0
- xxspltd 38, 3, 0
- xxspltd 39, 4, 0
- xxspltd 41, 8, 0
- xxspltd 44, 10, 0
- xxspltd 47, 11, 0
- xxspltd 48, 12, 0
- xxlxor 0, 34, 41
- xxlxor 1, 35, 44
- xxswapd 37, 9
- xxlxor 2, 39, 37
- xxlxor 3, 40, 37
- xxsel 34, 32, 2, 0
- xxsel 35, 33, 3, 1
- xxlxor 0, 36, 47
- xxlxor 1, 45, 37
- xxsel 36, 42, 1, 0
- xxlxor 0, 38, 48
- xxlxor 1, 46, 37
- xxsel 37, 43, 1, 0
- blr
- .long 0
- .quad 0
-.Lfunc_end48:
- .size sadd, .Lfunc_end48-.Lfunc_begin48
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i64_with_single_use # -- Begin function unsigned_sat_constant_i64_with_single_use
- .p2align 4
- .type unsigned_sat_constant_i64_with_single_use,@function
-unsigned_sat_constant_i64_with_single_use: # @unsigned_sat_constant_i64_with_single_use
-.Lfunc_begin49:
- .cfi_startproc
-# %bb.0:
- li 4, 4
- subc 3, 3, 4
- li 4, 0
- addze. 4, 4
- iseleq 3, 0, 3
- blr
- .long 0
- .quad 0
-.Lfunc_end49:
- .size unsigned_sat_constant_i64_with_single_use, .Lfunc_end49-.Lfunc_begin49
- .cfi_endproc
- # -- End function
- .globl unsigned_sat_constant_i64_with_multiple_use # -- Begin function unsigned_sat_constant_i64_with_multiple_use
- .p2align 4
- .type unsigned_sat_constant_i64_with_multiple_use,@function
-unsigned_sat_constant_i64_with_multiple_use: # @unsigned_sat_constant_i64_with_multiple_use
-.Lfunc_begin50:
- .cfi_startproc
-# %bb.0:
- cmpldi 3, 4
- li 5, 4
- isellt 5, 3, 5
- sub 3, 3, 5
- add 4, 4, 5
- mulld 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end50:
- .size unsigned_sat_constant_i64_with_multiple_use, .Lfunc_end50-.Lfunc_begin50
- .cfi_endproc
- # -- End function
- .section ".note.GNU-stack","", at progbits
diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
index 02aeebdeb37757..b1d396d70ff5fd 100644
--- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll
+++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
@@ -8,19 +8,21 @@ define i32 @optbranch_32(i32 %Arg) {
; RV32-LABEL: optbranch_32:
; RV32: # %bb.0: # %bb
; RV32-NEXT: addi a0, a0, 1
-; RV32-NEXT: bnez a0, .LBB0_2
-; RV32-NEXT: # %bb.1: # %bb2
+; RV32-NEXT: beqz a0, .LBB0_2
+; RV32-NEXT: # %bb.1: # %bb3
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB0_2: # %bb2
; RV32-NEXT: li a0, -1
-; RV32-NEXT: .LBB0_2: # %bb3
; RV32-NEXT: ret
;
; RV64-LABEL: optbranch_32:
; RV64: # %bb.0: # %bb
; RV64-NEXT: addiw a0, a0, 1
-; RV64-NEXT: bnez a0, .LBB0_2
-; RV64-NEXT: # %bb.1: # %bb2
+; RV64-NEXT: beqz a0, .LBB0_2
+; RV64-NEXT: # %bb.1: # %bb3
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB0_2: # %bb2
; RV64-NEXT: li a0, -1
-; RV64-NEXT: .LBB0_2: # %bb3
; RV64-NEXT: ret
bb:
%i1 = icmp eq i32 %Arg, -1
@@ -41,20 +43,22 @@ define i64 @optbranch_64(i64 %Arg) {
; RV32-NEXT: seqz a2, a0
; RV32-NEXT: add a1, a1, a2
; RV32-NEXT: or a2, a0, a1
-; RV32-NEXT: bnez a2, .LBB1_2
-; RV32-NEXT: # %bb.1: # %bb2
+; RV32-NEXT: beqz a2, .LBB1_2
+; RV32-NEXT: # %bb.1: # %bb3
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB1_2: # %bb2
; RV32-NEXT: li a0, -1
; RV32-NEXT: li a1, -1
-; RV32-NEXT: .LBB1_2: # %bb3
; RV32-NEXT: ret
;
; RV64-LABEL: optbranch_64:
; RV64: # %bb.0: # %bb
; RV64-NEXT: addi a0, a0, 1
-; RV64-NEXT: bnez a0, .LBB1_2
-; RV64-NEXT: # %bb.1: # %bb2
+; RV64-NEXT: beqz a0, .LBB1_2
+; RV64-NEXT: # %bb.1: # %bb3
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB1_2: # %bb2
; RV64-NEXT: li a0, -1
-; RV64-NEXT: .LBB1_2: # %bb3
; RV64-NEXT: ret
bb:
%i1 = icmp eq i64 %Arg, -1
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 376bbb7018700d..e10b360b35b569 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -254,60 +254,33 @@ vector.ph:
}
define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
-; SSE2OR3-LABEL: test3:
-; SSE2OR3: # %bb.0: # %vector.ph
-; SSE2OR3-NEXT: movd %edi, %xmm1
-; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2OR3-NEXT: movdqa %xmm0, %xmm3
-; SSE2OR3-NEXT: pxor %xmm1, %xmm3
-; SSE2OR3-NEXT: psubw %xmm2, %xmm0
-; SSE2OR3-NEXT: pxor %xmm0, %xmm1
-; SSE2OR3-NEXT: pcmpgtw %xmm3, %xmm1
-; SSE2OR3-NEXT: pandn %xmm0, %xmm1
-; SSE2OR3-NEXT: movdqa %xmm1, %xmm0
-; SSE2OR3-NEXT: retq
-;
-; SSE41-LABEL: test3:
-; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: movd %edi, %xmm1
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psubw %xmm1, %xmm2
-; SSE41-NEXT: pminuw %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test3:
+; SSE: # %bb.0: # %vector.ph
+; SSE-NEXT: movd %edi, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE-NEXT: psubusw %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpminuw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpminuw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test3:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastw %edi, %xmm1
-; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpleuw %xmm0, %xmm1, %k1
-; AVX512-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <8 x i16> undef, i16 %w, i32 0
@@ -359,11 +332,7 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psubb %xmm1, %xmm2
-; SSE2-NEXT: pminub %xmm2, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: psubusb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test6:
@@ -371,11 +340,7 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSSE3-NEXT: movd %edi, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: psubb %xmm1, %xmm2
-; SSSE3-NEXT: pminub %xmm2, %xmm0
-; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: psubusb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test6:
@@ -383,11 +348,7 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE41-NEXT: movd %edi, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pshufb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psubb %xmm1, %xmm2
-; SSE41-NEXT: pminub %xmm2, %xmm0
-; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: psubusb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test6:
@@ -395,28 +356,20 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test6:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastb %edi, %xmm1
-; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpcmpleub %xmm0, %xmm1, %k1
-; AVX512-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <16 x i8> undef, i8 %w, i32 0
@@ -589,45 +542,14 @@ vector.ph:
}
define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
-; SSE2OR3-LABEL: test9:
-; SSE2OR3: # %bb.0: # %vector.ph
-; SSE2OR3-NEXT: movd %edi, %xmm2
-; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1]
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2OR3-NEXT: movdqa %xmm1, %xmm5
-; SSE2OR3-NEXT: pxor %xmm2, %xmm5
-; SSE2OR3-NEXT: psubw %xmm4, %xmm1
-; SSE2OR3-NEXT: movdqa %xmm1, %xmm3
-; SSE2OR3-NEXT: pxor %xmm2, %xmm3
-; SSE2OR3-NEXT: pcmpgtw %xmm5, %xmm3
-; SSE2OR3-NEXT: movdqa %xmm0, %xmm5
-; SSE2OR3-NEXT: pxor %xmm2, %xmm5
-; SSE2OR3-NEXT: psubw %xmm4, %xmm0
-; SSE2OR3-NEXT: pxor %xmm0, %xmm2
-; SSE2OR3-NEXT: pcmpgtw %xmm5, %xmm2
-; SSE2OR3-NEXT: pandn %xmm0, %xmm2
-; SSE2OR3-NEXT: pandn %xmm1, %xmm3
-; SSE2OR3-NEXT: movdqa %xmm2, %xmm0
-; SSE2OR3-NEXT: movdqa %xmm3, %xmm1
-; SSE2OR3-NEXT: retq
-;
-; SSE41-LABEL: test9:
-; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: movd %edi, %xmm2
-; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psubw %xmm2, %xmm3
-; SSE41-NEXT: pminuw %xmm3, %xmm1
-; SSE41-NEXT: pcmpeqw %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: psubw %xmm2, %xmm4
-; SSE41-NEXT: pminuw %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqw %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: retq
+; SSE-LABEL: test9:
+; SSE: # %bb.0: # %vector.ph
+; SSE-NEXT: movd %edi, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE-NEXT: psubusw %xmm2, %xmm0
+; SSE-NEXT: psubusw %xmm2, %xmm1
+; SSE-NEXT: retq
;
; AVX1-LABEL: test9:
; AVX1: # %bb.0: # %vector.ph
@@ -635,33 +557,22 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; AVX1-NEXT: vmovd %edi, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT: vpsubw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpminuw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpeqw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpminuw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpeqw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test9:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
-; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpminuw %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test9:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastw %edi, %ymm1
-; AVX512-NEXT: vpsubw %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpcmpleuw %ymm0, %ymm1, %k1
-; AVX512-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} {z}
+; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <16 x i16> undef, i16 %w, i32 0
@@ -776,16 +687,8 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubb %xmm2, %xmm3
-; SSE2-NEXT: pminub %xmm3, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psubb %xmm2, %xmm4
-; SSE2-NEXT: pminub %xmm4, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: psubusb %xmm2, %xmm0
+; SSE2-NEXT: psubusb %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test12:
@@ -793,16 +696,8 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSSE3-NEXT: movd %edi, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: psubb %xmm2, %xmm3
-; SSSE3-NEXT: pminub %xmm3, %xmm1
-; SSSE3-NEXT: pcmpeqb %xmm3, %xmm1
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: psubb %xmm2, %xmm4
-; SSSE3-NEXT: pminub %xmm4, %xmm0
-; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: psubusb %xmm2, %xmm0
+; SSSE3-NEXT: psubusb %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test12:
@@ -810,16 +705,8 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE41-NEXT: movd %edi, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pshufb %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psubb %xmm2, %xmm3
-; SSE41-NEXT: pminub %xmm3, %xmm1
-; SSE41-NEXT: pcmpeqb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: psubb %xmm2, %xmm4
-; SSE41-NEXT: pminub %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqb %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm3, %xmm1
+; SSE41-NEXT: psubusb %xmm2, %xmm0
+; SSE41-NEXT: psubusb %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test12:
@@ -828,33 +715,22 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsubusb %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpminub %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test12:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastb %edi, %ymm1
-; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpcmpleub %ymm0, %ymm1, %k1
-; AVX512-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} {z}
+; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <32 x i8> undef, i8 %w, i32 0
@@ -868,122 +744,87 @@ vector.ph:
define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: test13:
; SSE2: # %bb.0: # %vector.ph
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: psubd %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: psubd %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE2-NEXT: packssdw %xmm2, %xmm3
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm4
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: packssdw %xmm0, %xmm4
-; SSE2-NEXT: pandn %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: por %xmm2, %xmm6
+; SSE2-NEXT: pslld $16, %xmm6
+; SSE2-NEXT: psrad $16, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pslld $16, %xmm5
+; SSE2-NEXT: psrad $16, %xmm5
+; SSE2-NEXT: packssdw %xmm6, %xmm5
+; SSE2-NEXT: psubusw %xmm5, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test13:
; SSSE3: # %bb.0: # %vector.ph
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: psubd %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: psubd %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3
-; SSSE3-NEXT: packssdw %xmm2, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm1, %xmm0
-; SSSE3-NEXT: pshufb %xmm1, %xmm4
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
-; SSSE3-NEXT: pandn %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: por %xmm2, %xmm6
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm6
+; SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm5, %xmm4
+; SSSE3-NEXT: pand %xmm1, %xmm5
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; SSSE3-NEXT: psubusw %xmm5, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test13:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psubd %xmm2, %xmm5
-; SSE41-NEXT: pminud %xmm5, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pminud %xmm2, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE41-NEXT: packssdw %xmm0, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
-; SSE41-NEXT: packusdw %xmm5, %xmm2
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
+; SSE41-NEXT: pminud %xmm3, %xmm2
+; SSE41-NEXT: pminud %xmm3, %xmm1
+; SSE41-NEXT: packusdw %xmm2, %xmm1
+; SSE41-NEXT: psubusw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test13:
; AVX1: # %bb.0: # %vector.ph
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpminud %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4],xmm2[5],xmm4[6],xmm2[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test13:
; AVX2: # %bb.0: # %vector.ph
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpminud %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test13:
; AVX512: # %bb.0: # %vector.ph
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpcmpleud %ymm0, %ymm1, %k1
-; AVX512-NEXT: vpmovdw %ymm1, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
+; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
vector.ph:
@@ -999,92 +840,80 @@ vector.ph:
define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSE2OR3-LABEL: test14:
; SSE2OR3: # %bb.0: # %vector.ph
+; SSE2OR3-NEXT: pxor %xmm6, %xmm6
; SSE2OR3-NEXT: movdqa %xmm0, %xmm5
-; SSE2OR3-NEXT: pxor %xmm0, %xmm0
-; SSE2OR3-NEXT: movdqa %xmm5, %xmm7
-; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2OR3-NEXT: movdqa %xmm7, %xmm6
-; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE2OR3-NEXT: movdqa %xmm5, %xmm8
-; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2OR3-NEXT: movdqa %xmm4, %xmm9
-; SSE2OR3-NEXT: pxor %xmm0, %xmm9
-; SSE2OR3-NEXT: psubd %xmm5, %xmm4
-; SSE2OR3-NEXT: movdqa %xmm4, %xmm5
-; SSE2OR3-NEXT: pxor %xmm0, %xmm5
-; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm5
-; SSE2OR3-NEXT: movdqa %xmm3, %xmm9
-; SSE2OR3-NEXT: pxor %xmm0, %xmm9
-; SSE2OR3-NEXT: psubd %xmm8, %xmm3
+; SSE2OR3-NEXT: movdqa %xmm4, %xmm7
; SSE2OR3-NEXT: movdqa %xmm3, %xmm8
-; SSE2OR3-NEXT: pxor %xmm0, %xmm8
-; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm8
-; SSE2OR3-NEXT: packssdw %xmm5, %xmm8
-; SSE2OR3-NEXT: movdqa %xmm2, %xmm5
-; SSE2OR3-NEXT: pxor %xmm0, %xmm5
-; SSE2OR3-NEXT: psubd %xmm7, %xmm2
-; SSE2OR3-NEXT: movdqa %xmm2, %xmm7
-; SSE2OR3-NEXT: pxor %xmm0, %xmm7
-; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2OR3-NEXT: movdqa %xmm1, %xmm5
-; SSE2OR3-NEXT: pxor %xmm0, %xmm5
-; SSE2OR3-NEXT: psubd %xmm6, %xmm1
-; SSE2OR3-NEXT: pxor %xmm1, %xmm0
-; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2OR3-NEXT: packssdw %xmm7, %xmm0
-; SSE2OR3-NEXT: packsswb %xmm8, %xmm0
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2OR3-NEXT: pand %xmm5, %xmm4
-; SSE2OR3-NEXT: pand %xmm5, %xmm3
+; SSE2OR3-NEXT: movdqa %xmm2, %xmm9
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2OR3-NEXT: pand %xmm10, %xmm4
+; SSE2OR3-NEXT: pand %xmm10, %xmm3
; SSE2OR3-NEXT: packuswb %xmm4, %xmm3
-; SSE2OR3-NEXT: pand %xmm5, %xmm2
-; SSE2OR3-NEXT: pand %xmm5, %xmm1
+; SSE2OR3-NEXT: movdqa %xmm1, %xmm4
+; SSE2OR3-NEXT: pand %xmm10, %xmm2
+; SSE2OR3-NEXT: pand %xmm10, %xmm1
; SSE2OR3-NEXT: packuswb %xmm2, %xmm1
; SSE2OR3-NEXT: packuswb %xmm3, %xmm1
+; SSE2OR3-NEXT: psubb %xmm0, %xmm1
+; SSE2OR3-NEXT: movdqa %xmm0, %xmm2
+; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2OR3-NEXT: movdqa %xmm2, %xmm0
+; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; SSE2OR3-NEXT: movdqa %xmm5, %xmm3
+; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2OR3-NEXT: pxor %xmm6, %xmm7
+; SSE2OR3-NEXT: por %xmm6, %xmm5
+; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE2OR3-NEXT: pxor %xmm6, %xmm8
+; SSE2OR3-NEXT: por %xmm6, %xmm3
+; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm3
+; SSE2OR3-NEXT: packssdw %xmm5, %xmm3
+; SSE2OR3-NEXT: pxor %xmm6, %xmm9
+; SSE2OR3-NEXT: por %xmm6, %xmm2
+; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm2
+; SSE2OR3-NEXT: pxor %xmm6, %xmm4
+; SSE2OR3-NEXT: por %xmm6, %xmm0
+; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE2OR3-NEXT: packssdw %xmm2, %xmm0
+; SSE2OR3-NEXT: packsswb %xmm3, %xmm0
; SSE2OR3-NEXT: pandn %xmm1, %xmm0
; SSE2OR3-NEXT: retq
;
; SSE41-LABEL: test14:
; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: psubd %xmm0, %xmm6
-; SSE41-NEXT: pminud %xmm6, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: psubd %xmm9, %xmm5
-; SSE41-NEXT: pminud %xmm5, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE41-NEXT: packssdw %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psubd %xmm8, %xmm0
-; SSE41-NEXT: pminud %xmm0, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psubd %xmm7, %xmm4
-; SSE41-NEXT: pminud %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm2
-; SSE41-NEXT: packssdw %xmm2, %xmm1
-; SSE41-NEXT: packsswb %xmm3, %xmm1
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255]
-; SSE41-NEXT: pand %xmm2, %xmm6
-; SSE41-NEXT: pand %xmm2, %xmm5
-; SSE41-NEXT: packusdw %xmm6, %xmm5
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pand %xmm2, %xmm4
-; SSE41-NEXT: packusdw %xmm4, %xmm0
-; SSE41-NEXT: packuswb %xmm5, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; SSE41-NEXT: pmaxud %xmm4, %xmm8
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm8
+; SSE41-NEXT: pmaxud %xmm3, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE41-NEXT: packssdw %xmm8, %xmm7
+; SSE41-NEXT: pmaxud %xmm1, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm6
+; SSE41-NEXT: pmaxud %xmm2, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm5
+; SSE41-NEXT: packssdw %xmm5, %xmm6
+; SSE41-NEXT: packsswb %xmm7, %xmm6
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = [255,255,255,255]
+; SSE41-NEXT: pand %xmm5, %xmm4
+; SSE41-NEXT: pand %xmm5, %xmm3
+; SSE41-NEXT: packusdw %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: pand %xmm1, %xmm5
+; SSE41-NEXT: packusdw %xmm2, %xmm5
+; SSE41-NEXT: packuswb %xmm3, %xmm5
+; SSE41-NEXT: psubb %xmm0, %xmm5
+; SSE41-NEXT: pand %xmm6, %xmm5
+; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test14:
@@ -1094,34 +923,31 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpminud %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm5
-; AVX1-NEXT: vpminud %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpackssdw %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpminud %xmm1, %xmm4, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm4, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpackssdw %xmm1, %xmm6, %xmm1
-; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
-; AVX1-NEXT: vpackusdw %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT: vpmaxud %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpmaxud %xmm5, %xmm2, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm5
+; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpmaxud %xmm4, %xmm1, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpmaxud %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1129,38 +955,35 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpmaxud %ymm4, %ymm1, %ymm4
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4
+; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpminud %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpacksswb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test14:
; AVX512: # %bb.0: # %vector.ph
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vpsubd %zmm0, %zmm1, %zmm0
-; AVX512-NEXT: vpcmpleud %zmm1, %zmm0, %k1
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpcmpnltud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1} {z}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
vector.ph:
@@ -1398,26 +1221,10 @@ define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: psubb %xmm4, %xmm5
-; SSE2-NEXT: pminub %xmm5, %xmm3
-; SSE2-NEXT: pcmpeqb %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: psubb %xmm4, %xmm6
-; SSE2-NEXT: pminub %xmm6, %xmm2
-; SSE2-NEXT: pcmpeqb %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: psubb %xmm4, %xmm7
-; SSE2-NEXT: pminub %xmm7, %xmm1
-; SSE2-NEXT: pcmpeqb %xmm7, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm8
-; SSE2-NEXT: psubb %xmm4, %xmm8
-; SSE2-NEXT: pminub %xmm8, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm8, %xmm0
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: psubusb %xmm4, %xmm0
+; SSE2-NEXT: psubusb %xmm4, %xmm1
+; SSE2-NEXT: psubusb %xmm4, %xmm2
+; SSE2-NEXT: psubusb %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test17:
@@ -1425,26 +1232,10 @@ define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
; SSSE3-NEXT: movd %edi, %xmm4
; SSSE3-NEXT: pxor %xmm5, %xmm5
; SSSE3-NEXT: pshufb %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: psubb %xmm4, %xmm5
-; SSSE3-NEXT: pminub %xmm5, %xmm3
-; SSSE3-NEXT: pcmpeqb %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
-; SSSE3-NEXT: psubb %xmm4, %xmm6
-; SSSE3-NEXT: pminub %xmm6, %xmm2
-; SSSE3-NEXT: pcmpeqb %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm7
-; SSSE3-NEXT: psubb %xmm4, %xmm7
-; SSSE3-NEXT: pminub %xmm7, %xmm1
-; SSSE3-NEXT: pcmpeqb %xmm7, %xmm1
-; SSSE3-NEXT: movdqa %xmm0, %xmm8
-; SSSE3-NEXT: psubb %xmm4, %xmm8
-; SSSE3-NEXT: pminub %xmm8, %xmm0
-; SSSE3-NEXT: pcmpeqb %xmm8, %xmm0
-; SSSE3-NEXT: pand %xmm8, %xmm0
-; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: psubusb %xmm4, %xmm0
+; SSSE3-NEXT: psubusb %xmm4, %xmm1
+; SSSE3-NEXT: psubusb %xmm4, %xmm2
+; SSSE3-NEXT: psubusb %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test17:
@@ -1452,26 +1243,10 @@ define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
; SSE41-NEXT: movd %edi, %xmm4
; SSE41-NEXT: pxor %xmm5, %xmm5
; SSE41-NEXT: pshufb %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: psubb %xmm4, %xmm5
-; SSE41-NEXT: pminub %xmm5, %xmm3
-; SSE41-NEXT: pcmpeqb %xmm5, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: psubb %xmm4, %xmm6
-; SSE41-NEXT: pminub %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqb %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm7
-; SSE41-NEXT: psubb %xmm4, %xmm7
-; SSE41-NEXT: pminub %xmm7, %xmm1
-; SSE41-NEXT: pcmpeqb %xmm7, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: psubb %xmm4, %xmm8
-; SSE41-NEXT: pminub %xmm8, %xmm0
-; SSE41-NEXT: pcmpeqb %xmm8, %xmm0
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: pand %xmm7, %xmm1
-; SSE41-NEXT: pand %xmm6, %xmm2
-; SSE41-NEXT: pand %xmm5, %xmm3
+; SSE41-NEXT: psubusb %xmm4, %xmm0
+; SSE41-NEXT: psubusb %xmm4, %xmm1
+; SSE41-NEXT: psubusb %xmm4, %xmm2
+; SSE41-NEXT: psubusb %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: test17:
@@ -1479,48 +1254,28 @@ define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind {
; AVX1-NEXT: vmovd %edi, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpminub %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm5
-; AVX1-NEXT: vpminub %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm6
-; AVX1-NEXT: vpminub %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpminub %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm3
-; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpminub %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpsubb %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpminub %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test17:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastb %edi, %zmm1
-; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm1
-; AVX512-NEXT: vpcmpleub %zmm0, %zmm1, %k1
-; AVX512-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <64 x i8> undef, i8 %w, i32 0
@@ -1532,119 +1287,44 @@ vector.ph:
}
define <32 x i16> @test18(<32 x i16> %x, i16 zeroext %w) nounwind {
-; SSE2OR3-LABEL: test18:
-; SSE2OR3: # %bb.0: # %vector.ph
-; SSE2OR3-NEXT: movdqa %xmm0, %xmm5
-; SSE2OR3-NEXT: movd %edi, %xmm0
-; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1]
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm0 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2OR3-NEXT: movdqa %xmm3, %xmm6
-; SSE2OR3-NEXT: pxor %xmm0, %xmm6
-; SSE2OR3-NEXT: psubw %xmm8, %xmm3
-; SSE2OR3-NEXT: movdqa %xmm3, %xmm4
-; SSE2OR3-NEXT: pxor %xmm0, %xmm4
-; SSE2OR3-NEXT: pcmpgtw %xmm6, %xmm4
-; SSE2OR3-NEXT: movdqa %xmm2, %xmm7
-; SSE2OR3-NEXT: pxor %xmm0, %xmm7
-; SSE2OR3-NEXT: psubw %xmm8, %xmm2
-; SSE2OR3-NEXT: movdqa %xmm2, %xmm6
-; SSE2OR3-NEXT: pxor %xmm0, %xmm6
-; SSE2OR3-NEXT: pcmpgtw %xmm7, %xmm6
-; SSE2OR3-NEXT: movdqa %xmm1, %xmm9
-; SSE2OR3-NEXT: pxor %xmm0, %xmm9
-; SSE2OR3-NEXT: psubw %xmm8, %xmm1
-; SSE2OR3-NEXT: movdqa %xmm1, %xmm7
-; SSE2OR3-NEXT: pxor %xmm0, %xmm7
-; SSE2OR3-NEXT: pcmpgtw %xmm9, %xmm7
-; SSE2OR3-NEXT: movdqa %xmm5, %xmm9
-; SSE2OR3-NEXT: pxor %xmm0, %xmm9
-; SSE2OR3-NEXT: psubw %xmm8, %xmm5
-; SSE2OR3-NEXT: pxor %xmm5, %xmm0
-; SSE2OR3-NEXT: pcmpgtw %xmm9, %xmm0
-; SSE2OR3-NEXT: pandn %xmm5, %xmm0
-; SSE2OR3-NEXT: pandn %xmm1, %xmm7
-; SSE2OR3-NEXT: pandn %xmm2, %xmm6
-; SSE2OR3-NEXT: pandn %xmm3, %xmm4
-; SSE2OR3-NEXT: movdqa %xmm7, %xmm1
-; SSE2OR3-NEXT: movdqa %xmm6, %xmm2
-; SSE2OR3-NEXT: movdqa %xmm4, %xmm3
-; SSE2OR3-NEXT: retq
-;
-; SSE41-LABEL: test18:
-; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: movd %edi, %xmm4
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: psubw %xmm4, %xmm5
-; SSE41-NEXT: pminuw %xmm5, %xmm3
-; SSE41-NEXT: pcmpeqw %xmm5, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: psubw %xmm4, %xmm6
-; SSE41-NEXT: pminuw %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqw %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm7
-; SSE41-NEXT: psubw %xmm4, %xmm7
-; SSE41-NEXT: pminuw %xmm7, %xmm1
-; SSE41-NEXT: pcmpeqw %xmm7, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: psubw %xmm4, %xmm8
-; SSE41-NEXT: pminuw %xmm8, %xmm0
-; SSE41-NEXT: pcmpeqw %xmm8, %xmm0
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: pand %xmm7, %xmm1
-; SSE41-NEXT: pand %xmm6, %xmm2
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: retq
+; SSE-LABEL: test18:
+; SSE: # %bb.0: # %vector.ph
+; SSE-NEXT: movd %edi, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE-NEXT: psubusw %xmm4, %xmm0
+; SSE-NEXT: psubusw %xmm4, %xmm1
+; SSE-NEXT: psubusw %xmm4, %xmm2
+; SSE-NEXT: psubusw %xmm4, %xmm3
+; SSE-NEXT: retq
;
; AVX1-LABEL: test18:
; AVX1: # %bb.0: # %vector.ph
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovd %edi, %xmm3
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
-; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpminuw %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpeqw %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm5
-; AVX1-NEXT: vpminuw %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpcmpeqw %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm6
-; AVX1-NEXT: vpminuw %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpeqw %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpminuw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpcmpeqw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsubusw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm2
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm2
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test18:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm2
; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX2-NEXT: vpsubw %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpminuw %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpsubw %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpminuw %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test18:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpbroadcastw %edi, %zmm1
-; AVX512-NEXT: vpsubw %zmm1, %zmm0, %zmm1
-; AVX512-NEXT: vpcmpleuw %zmm0, %zmm1, %k1
-; AVX512-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <32 x i16> undef, i16 %w, i32 0
diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index 4e31b48ec5cece..1b307b30d8c0d1 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -2065,10 +2065,11 @@ define i64 @PR51612(i64 %x, i64 %y) {
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %eax
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %edx
-; ATHLON-NEXT: incl %edx
-; ATHLON-NEXT: addl $1, %eax
-; ATHLON-NEXT: adcl $0, %ecx
-; ATHLON-NEXT: cmovbl %edx, %eax
+; ATHLON-NEXT: addl $1, %ecx
+; ATHLON-NEXT: adcl $0, %edx
+; ATHLON-NEXT: incl %eax
+; ATHLON-NEXT: orl %ecx, %edx
+; ATHLON-NEXT: cmovnel %ecx, %eax
; ATHLON-NEXT: andl 10, %eax
; ATHLON-NEXT: xorl %edx, %edx
; ATHLON-NEXT: retl
@@ -2077,7 +2078,8 @@ define i64 @PR51612(i64 %x, i64 %y) {
; MCU: # %bb.0:
; MCU-NEXT: addl $1, %eax
; MCU-NEXT: adcl $0, %edx
-; MCU-NEXT: jae .LBB45_2
+; MCU-NEXT: orl %eax, %edx
+; MCU-NEXT: jne .LBB45_2
; MCU-NEXT: # %bb.1:
; MCU-NEXT: movl {{[0-9]+}}(%esp), %eax
; MCU-NEXT: incl %eax
diff --git a/test_direct_uaddo.ll b/test_direct_uaddo.ll
deleted file mode 100644
index a923d212bbf904..00000000000000
--- a/test_direct_uaddo.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-define i32 @test_direct_uaddo(i32 %x, i32 %y) {
- %result = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
- %ovf = extractvalue {i32, i1} %result, 1
- %val = extractvalue {i32, i1} %result, 0
- %sel = select i1 %ovf, i32 -1, i32 %val
- ret i32 %sel
-}
-
-declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
diff --git a/test_sat_pattern.ll b/test_sat_pattern.ll
deleted file mode 100644
index 150c8081a77ac7..00000000000000
--- a/test_sat_pattern.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-define <8 x i16> @test_sat_pattern(<8 x i16> %x, <8 x i16> %y) {
- %a = add <8 x i16> %x, %y
- %c = icmp ugt <8 x i16> %x, %a
- %r = select <8 x i1> %c, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %a
- ret <8 x i16> %r
-}
diff --git a/test_sat_pattern.s b/test_sat_pattern.s
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/test_scalar_sat.ll b/test_scalar_sat.ll
deleted file mode 100644
index 6ef9729e66a754..00000000000000
--- a/test_scalar_sat.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-define i8 @test_scalar_sat(i8 %x) {
- %a = add i8 %x, 42
- %c = icmp ugt i8 %x, %a
- %r = select i1 %c, i8 -1, i8 %a
- ret i8 %r
-}
diff --git a/test_uaddo_conversion.ll b/test_uaddo_conversion.ll
deleted file mode 100644
index ca433863997b79..00000000000000
--- a/test_uaddo_conversion.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-define i32 @test_uaddo_to_uaddsat(i32 %x, i32 %y) {
- %result = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
- %val = extractvalue {i32, i1} %result, 0
- %ovf = extractvalue {i32, i1} %result, 1
- %sel = select i1 %ovf, i32 -1, i32 %val
- ret i32 %sel
-}
-
-declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
diff --git a/test_uaddo_only.ll b/test_uaddo_only.ll
deleted file mode 100644
index 4f7056148fa994..00000000000000
--- a/test_uaddo_only.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-define i32 @test_uaddo_only(i32 %x, i32 %y) {
- %add = add i32 %x, %y
- %cmp = icmp ugt i32 %x, %add
- %sel = select i1 %cmp, i32 -1, i32 %add
- ret i32 %sel
-}
diff --git a/test_uaddo_only.s b/test_uaddo_only.s
deleted file mode 100644
index e04ea329bd8e97..00000000000000
--- a/test_uaddo_only.s
+++ /dev/null
@@ -1,22 +0,0 @@
- .abiversion 2
- .file "test_uaddo_only.ll"
- .text
- .globl test_uaddo_only # -- Begin function test_uaddo_only
- .p2align 4
- .type test_uaddo_only,@function
-test_uaddo_only: # @test_uaddo_only
-.Lfunc_begin0:
- .cfi_startproc
-# %bb.0:
- add 4, 3, 4
- cmplw 4, 3
- li 3, -1
- isellt 3, 3, 4
- blr
- .long 0
- .quad 0
-.Lfunc_end0:
- .size test_uaddo_only, .Lfunc_end0-.Lfunc_begin0
- .cfi_endproc
- # -- End function
- .section ".note.GNU-stack","", at progbits
diff --git a/test_uaddsat.ll b/test_uaddsat.ll
deleted file mode 100644
index 0c5423504fb482..00000000000000
--- a/test_uaddsat.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; Test file to verify uaddo -> uaddsat conversion
-define i32 @test_uaddsat_pattern(i32 %x, i32 %y) {
- %add = add i32 %x, %y
- %cmp = icmp ugt i32 %x, %add
- %sel = select i1 %cmp, i32 -1, i32 %add
- ret i32 %sel
-}
-
-declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
diff --git a/test_usubo.ll b/test_usubo.ll
deleted file mode 100644
index e588f43f3cec9e..00000000000000
--- a/test_usubo.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; Test file to verify usubo -> usubsat conversion
-define i32 @test_usubo_to_usubsat(i32 %x, i32 %y) {
- %result = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %x, i32 %y)
- %val = extractvalue {i32, i1} %result, 0
- ret i32 %val
-}
-
-define i32 @test_uaddo_to_uaddsat(i32 %x, i32 %y) {
- %result = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
- %val = extractvalue {i32, i1} %result, 0
- ret i32 %val
-}
-
-declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32)
-declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
diff --git a/test_vector_uaddo.ll b/test_vector_uaddo.ll
deleted file mode 100644
index 8105ed0041f54e..00000000000000
--- a/test_vector_uaddo.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-define <8 x i16> @test_vector_uaddo(<8 x i16> %x, <8 x i16> %y) {
- %result = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> %x, <8 x i16> %y)
- %ovf = extractvalue { <8 x i16>, <8 x i1> } %result, 1
- %val = extractvalue { <8 x i16>, <8 x i1> } %result, 0
- %sel = select <8 x i1> %ovf, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %val
- ret <8 x i16> %sel
-}
-
-declare { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
diff --git a/test_vector_uaddo.s b/test_vector_uaddo.s
deleted file mode 100644
index 5834fc58aa5621..00000000000000
--- a/test_vector_uaddo.s
+++ /dev/null
@@ -1,21 +0,0 @@
- .abiversion 2
- .file "test_vector_uaddo.ll"
- .text
- .globl test_vector_uaddo # -- Begin function test_vector_uaddo
- .p2align 4
- .type test_vector_uaddo,@function
-test_vector_uaddo: # @test_vector_uaddo
-.Lfunc_begin0:
- .cfi_startproc
-# %bb.0:
- vadduhm 3, 2, 3
- vcmpgtuh 2, 2, 3
- xxlor 34, 34, 35
- blr
- .long 0
- .quad 0
-.Lfunc_end0:
- .size test_vector_uaddo, .Lfunc_end0-.Lfunc_begin0
- .cfi_endproc
- # -- End function
- .section ".note.GNU-stack","", at progbits
diff --git a/trace_uaddsat.ll b/trace_uaddsat.ll
deleted file mode 100644
index 8fccd2816d67fd..00000000000000
--- a/trace_uaddsat.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-define i32 @test_uaddsat(i32 %x, i32 %y) {
- %add = add i32 %x, %y
- %cmp = icmp ugt i32 %x, %add
- %sel = select i1 %cmp, i32 -1, i32 %add
- ret i32 %sel
-}
>From a58c3651fd9d32a9b9b83dc6d5f1332674be4d8d Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Wed, 24 Sep 2025 12:45:50 -0400
Subject: [PATCH 06/17] Update DAGCombiner.cpp
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 25 -------------------
1 file changed, 25 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index db2fc895cf09ff..a6ba6e518899f8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13347,31 +13347,6 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
DAG.getNode(ISD::AND, DL, N0.getValueType(), N1.getOperand(1), N0));
}
- // vselect uaddo(x, y).overflow, -1, uaddo(x, y) -> uaddsat(x, y)
- // This converts the pattern created by CodeGenPrepare back to uaddsat
- // Handle the case where overflow might be sign-extended
- if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG) {
- // Look through sign_extend_inreg to find the actual overflow flag
- (void)N0.getOperand(0);
- if ((N1.getOpcode() == ISD::UADDO && N1.getResNo() == 0 &&
- ISD::isConstantSplatVectorAllOnes(N2.getNode())) ||
- (N2.getOpcode() == ISD::UADDO && N2.getResNo() == 0 &&
- ISD::isConstantSplatVectorAllOnes(N1.getNode()))) {
- LLVM_DEBUG(dbgs() << "Converting uaddo to uaddsat\n");
- return DAG.getNode(ISD::UADDSAT, DL, VT,
- N1.getOpcode() == ISD::UADDO ? N1.getOperand(0) : N2.getOperand(0),
- N1.getOpcode() == ISD::UADDO ? N1.getOperand(1) : N2.getOperand(1));
- }
- } else if ((N1.getOpcode() == ISD::UADDO && N1.getResNo() == 0 &&
- ISD::isConstantSplatVectorAllOnes(N2.getNode())) ||
- (N2.getOpcode() == ISD::UADDO && N2.getResNo() == 0 &&
- ISD::isConstantSplatVectorAllOnes(N1.getNode()))) {
- LLVM_DEBUG(dbgs() << "Converting uaddo to uaddsat\n");
- return DAG.getNode(ISD::UADDSAT, DL, VT,
- N1.getOpcode() == ISD::UADDO ? N1.getOperand(0) : N2.getOperand(0),
- N1.getOpcode() == ISD::UADDO ? N1.getOperand(1) : N2.getOperand(1));
- }
-
// Canonicalize integer abs.
// vselect (setg[te] X, 0), X, -X ->
// vselect (setgt X, -1), X, -X ->
>From e0b263c0ee0d54b29bb18efccf6d9bec9629d0a5 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Wed, 24 Sep 2025 12:58:19 -0400
Subject: [PATCH 07/17] n
---
llvm/test/CodeGen/PowerPC/sat-add.ll | 52 +++++++++++++---------------
1 file changed, 24 insertions(+), 28 deletions(-)
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll
index 771c2ca0a866c3..34b703a9811058 100644
--- a/llvm/test/CodeGen/PowerPC/sat-add.ll
+++ b/llvm/test/CodeGen/PowerPC/sat-add.ll
@@ -24,12 +24,11 @@ define i8 @unsigned_sat_constant_i8_using_min(i8 %x) {
define i8 @unsigned_sat_constant_i8_using_cmp_sum(i8 %x) {
; CHECK-LABEL: unsigned_sat_constant_i8_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 4, 3, 24
+; CHECK-NEXT: clrlwi 3, 3, 24
; CHECK-NEXT: addi 3, 3, 42
-; CHECK-NEXT: clrlwi 5, 3, 24
-; CHECK-NEXT: cmplw 4, 5
+; CHECK-NEXT: andi. 4, 3, 256
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iselgt 3, 4, 3
+; CHECK-NEXT: iseleq 3, 3, 4
; CHECK-NEXT: blr
%a = add i8 %x, 42
%c = icmp ugt i8 %x, %a
@@ -70,12 +69,11 @@ define i16 @unsigned_sat_constant_i16_using_min(i16 %x) {
define i16 @unsigned_sat_constant_i16_using_cmp_sum(i16 %x) {
; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 4, 3, 16
+; CHECK-NEXT: clrlwi 3, 3, 16
; CHECK-NEXT: addi 3, 3, 42
-; CHECK-NEXT: clrlwi 5, 3, 16
-; CHECK-NEXT: cmplw 4, 5
+; CHECK-NEXT: andis. 4, 3, 1
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iselgt 3, 4, 3
+; CHECK-NEXT: iseleq 3, 3, 4
; CHECK-NEXT: blr
%a = add i16 %x, 42
%c = icmp ugt i16 %x, %a
@@ -116,9 +114,9 @@ define i32 @unsigned_sat_constant_i32_using_cmp_sum(i32 %x) {
; CHECK-LABEL: unsigned_sat_constant_i32_using_cmp_sum:
; CHECK: # %bb.0:
; CHECK-NEXT: addi 4, 3, 42
-; CHECK-NEXT: cmplw 3, 4
+; CHECK-NEXT: cmplw 4, 3
; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: iselgt 3, 3, 4
+; CHECK-NEXT: isellt 3, 3, 4
; CHECK-NEXT: blr
%a = add i32 %x, 42
%c = icmp ugt i32 %x, %a
@@ -205,12 +203,12 @@ define i8 @unsigned_sat_variable_i8_using_min(i8 %x, i8 %y) {
define i8 @unsigned_sat_variable_i8_using_cmp_sum(i8 %x, i8 %y) {
; CHECK-LABEL: unsigned_sat_variable_i8_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 5, 3, 24
+; CHECK-NEXT: clrlwi 4, 4, 24
+; CHECK-NEXT: clrlwi 3, 3, 24
; CHECK-NEXT: add 3, 3, 4
-; CHECK-NEXT: clrlwi 4, 3, 24
-; CHECK-NEXT: cmplw 5, 4
+; CHECK-NEXT: andi. 4, 3, 256
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iselgt 3, 4, 3
+; CHECK-NEXT: iseleq 3, 3, 4
; CHECK-NEXT: blr
%a = add i8 %x, %y
%c = icmp ugt i8 %x, %a
@@ -256,12 +254,12 @@ define i16 @unsigned_sat_variable_i16_using_min(i16 %x, i16 %y) {
define i16 @unsigned_sat_variable_i16_using_cmp_sum(i16 %x, i16 %y) {
; CHECK-LABEL: unsigned_sat_variable_i16_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: clrlwi 5, 3, 16
+; CHECK-NEXT: clrlwi 4, 4, 16
+; CHECK-NEXT: clrlwi 3, 3, 16
; CHECK-NEXT: add 3, 3, 4
-; CHECK-NEXT: clrlwi 4, 3, 16
-; CHECK-NEXT: cmplw 5, 4
+; CHECK-NEXT: andis. 4, 3, 1
; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: iselgt 3, 4, 3
+; CHECK-NEXT: iseleq 3, 3, 4
; CHECK-NEXT: blr
%a = add i16 %x, %y
%c = icmp ugt i16 %x, %a
@@ -306,9 +304,9 @@ define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) {
; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_sum:
; CHECK: # %bb.0:
; CHECK-NEXT: add 4, 3, 4
-; CHECK-NEXT: cmplw 3, 4
+; CHECK-NEXT: cmplw 4, 3
; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: iselgt 3, 3, 4
+; CHECK-NEXT: isellt 3, 3, 4
; CHECK-NEXT: blr
%a = add i32 %x, %y
%c = icmp ugt i32 %x, %a
@@ -540,11 +538,9 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
; CHECK-NEXT: addis 3, 2, .LCPI34_0@toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI34_0@toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
-; CHECK-NEXT: addis 3, 2, .LCPI34_1@toc@ha
-; CHECK-NEXT: addi 3, 3, .LCPI34_1@toc@l
-; CHECK-NEXT: lxvd2x 36, 0, 3
-; CHECK-NEXT: vminud 2, 2, 4
-; CHECK-NEXT: vaddudm 2, 2, 3
+; CHECK-NEXT: vaddudm 3, 2, 3
+; CHECK-NEXT: vcmpgtud 2, 2, 3
+; CHECK-NEXT: xxlor 34, 34, 35
; CHECK-NEXT: blr
%a = add <2 x i64> %x, <i64 42, i64 42>
%c = icmp ugt <2 x i64> %x, %a
@@ -708,9 +704,9 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64>
define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
; CHECK: # %bb.0:
-; CHECK-NEXT: xxlnor 36, 35, 35
-; CHECK-NEXT: vminud 2, 2, 4
-; CHECK-NEXT: vaddudm 2, 2, 3
+; CHECK-NEXT: vaddudm 3, 2, 3
+; CHECK-NEXT: vcmpgtud 2, 2, 3
+; CHECK-NEXT: xxlor 34, 34, 35
; CHECK-NEXT: blr
%a = add <2 x i64> %x, %y
%c = icmp ugt <2 x i64> %x, %a
>From 1d26bff3d345754a1612353bbc9bf3e11efd13a7 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Wed, 24 Sep 2025 14:39:56 -0400
Subject: [PATCH 08/17] n
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 --
llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++++
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index bcf14a77f9d702..233689d1f52ad4 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3464,8 +3464,6 @@ class LLVM_ABI TargetLoweringBase {
// besides the overflow check. On some targets (e.g. SPARC), it is
// not profitable to form on overflow op if the math result has no
// concrete users.
- if (VT.isVector())
- return false;
return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT));
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 47d0010504d28b..31d7441b7db2e8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3406,6 +3406,10 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
bool) const {
+ // TODO: Allow vectors?
+ if (VT.isVector())
+ return false;
+
return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
}
>From 4402164f06036bfc40b8dba5b7efb8908563d618 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Thu, 25 Sep 2025 09:11:16 -0400
Subject: [PATCH 09/17] h
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 ++
llvm/lib/Target/X86/X86ISelLowering.cpp | 5 -----
2 files changed, 2 insertions(+), 5 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 233689d1f52ad4..bcf14a77f9d702 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3464,6 +3464,8 @@ class LLVM_ABI TargetLoweringBase {
// besides the overflow check. On some targets (e.g. SPARC), it is
// not profitable to form on overflow op if the math result has no
// concrete users.
+ if (VT.isVector())
+ return false;
return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT));
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 31d7441b7db2e8..b22a5476b57b4c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3405,11 +3405,6 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
bool) const {
-
- // TODO: Allow vectors?
- if (VT.isVector())
- return false;
-
return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
}
>From 63609034a7b4f65ba43805666117a2263597f86c Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Thu, 25 Sep 2025 09:21:21 -0400
Subject: [PATCH 10/17] h
---
llvm/include/llvm/CodeGen/TargetLowering.h | 3 ++-
llvm/lib/Target/RISCV/RISCVISelLowering.h | 8 --------
2 files changed, 2 insertions(+), 9 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index bcf14a77f9d702..643ec35ed90e2e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3466,7 +3466,8 @@ class LLVM_ABI TargetLoweringBase {
// concrete users.
if (VT.isVector())
return false;
- return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT));
+ return MathUsed &&
+ (isTypeLegal(VT) || isOperationLegalOrCustomOrPromote(Opcode, VT));
}
// Return true if it is profitable to use a scalar input to a BUILD_VECTOR
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 3f81ed74c12edb..54f12e76a48451 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -201,14 +201,6 @@ class RISCVTargetLowering : public TargetLowering {
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
- bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
- bool MathUsed) const override {
- if (VT == MVT::i8 || VT == MVT::i16)
- return false;
-
- return TargetLowering::shouldFormOverflowOp(Opcode, VT, MathUsed);
- }
-
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem,
unsigned AddrSpace) const override {
// If we can replace 4 or more scalar stores, there will be a reduction
>From 08252afe294fd05b22760e638c912ab4be1d2455 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Thu, 25 Sep 2025 09:26:32 -0400
Subject: [PATCH 11/17] e
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 643ec35ed90e2e..030fe706207ede 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3467,7 +3467,7 @@ class LLVM_ABI TargetLoweringBase {
if (VT.isVector())
return false;
return MathUsed &&
- (isTypeLegal(VT) || isOperationLegalOrCustomOrPromote(Opcode, VT));
+ (isTypeLegal(VT) || isOperationLegalOrCustom(Opcode, VT));
}
// Return true if it is profitable to use a scalar input to a BUILD_VECTOR
>From 027cc9e04aed541c8653ede15366de9140672616 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Thu, 25 Sep 2025 09:40:13 -0400
Subject: [PATCH 12/17] Update TargetLowering.h
---
llvm/include/llvm/CodeGen/TargetLowering.h | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 030fe706207ede..275a79ae604e20 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3458,7 +3458,7 @@ class LLVM_ABI TargetLoweringBase {
// Form it if it is legal.
if (isOperationLegal(Opcode, VT))
return true;
-
+
// Allow the transform as long as we have an integer type that is not
// obviously illegal and unsupported and if the math result is used
// besides the overflow check. On some targets (e.g. SPARC), it is
@@ -3466,8 +3466,7 @@ class LLVM_ABI TargetLoweringBase {
// concrete users.
if (VT.isVector())
return false;
- return MathUsed &&
- (isTypeLegal(VT) || isOperationLegalOrCustom(Opcode, VT));
+ return MathUsed && (isTypeLegal(VT) || isOperationCustom(Opcode, VT));
}
// Return true if it is profitable to use a scalar input to a BUILD_VECTOR
>From 112543fb46dd018b9a31ea90eaec73c2f0b50ecb Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Thu, 25 Sep 2025 12:16:56 -0400
Subject: [PATCH 13/17] d
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 +-
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 ++
llvm/lib/Target/ARM/ARMISelLowering.h | 2 ++
llvm/lib/Target/RISCV/RISCVISelLowering.h | 8 ++++++++
4 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 275a79ae604e20..6165b7e43ab855 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3466,7 +3466,7 @@ class LLVM_ABI TargetLoweringBase {
// concrete users.
if (VT.isVector())
return false;
- return MathUsed && (isTypeLegal(VT) || isOperationCustom(Opcode, VT));
+ return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT));
}
// Return true if it is profitable to use a scalar input to a BUILD_VECTOR
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d8072d15853ee0..510773bfccf887 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -321,6 +321,8 @@ class AArch64TargetLowering : public TargetLowering {
bool MathUsed) const override {
// Using overflow ops for overflow checks only should beneficial on
// AArch64.
+ if (VT == MVT::i8 || VT == MVT::i16)
+ return false;
return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index fa130a153b0de3..cfce133bbaad01 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -642,6 +642,8 @@ class VectorType;
bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
bool MathUsed) const override {
// Using overflow ops for overflow checks only should beneficial on ARM.
+ if (VT == MVT::i8 || VT == MVT::i16)
+ return false;
return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 54f12e76a48451..3f81ed74c12edb 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -201,6 +201,14 @@ class RISCVTargetLowering : public TargetLowering {
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool MathUsed) const override {
+ if (VT == MVT::i8 || VT == MVT::i16)
+ return false;
+
+ return TargetLowering::shouldFormOverflowOp(Opcode, VT, MathUsed);
+ }
+
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem,
unsigned AddrSpace) const override {
// If we can replace 4 or more scalar stores, there will be a reduction
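[Note: the i8/i16 bail-outs added above keep CodeGenPrepare from forming overflow intrinsics on sub-word types, which these targets legalize by promotion. A minimal sketch of the kind of check that now stays as a plain add-plus-compare on those targets (function name illustrative):

define i1 @uaddo_i8_check(i8 %x, i8 %y) {
  %a = add i8 %x, %y
  %ov = icmp ult i8 %a, %x           ; unsigned overflow of the i8 add
  ret i1 %ov
}]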
>From ad23f53e12257cb48245d0c733414b74448a8000 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Thu, 25 Sep 2025 12:56:49 -0400
Subject: [PATCH 14/17] v
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 +-
.../Hexagon/atomicrmw-uinc-udec-wrap.ll | 103 +++++----
.../test/CodeGen/RISCV/overflow-intrinsics.ll | 205 +++++++++---------
llvm/test/CodeGen/X86/abdu-neg.ll | 93 ++++----
llvm/test/CodeGen/X86/select.ll | 12 +-
5 files changed, 204 insertions(+), 211 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 6165b7e43ab855..acbc84568300d0 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3466,7 +3466,7 @@ class LLVM_ABI TargetLoweringBase {
// concrete users.
if (VT.isVector())
return false;
- return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT));
+ return MathUsed && !isOperationExpand(Opcode, VT);
}
// Return true if it is profitable to use a scalar input to a BUILD_VECTOR
diff --git a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
index 8e673c1bb06ba3..55a8f2eb768c8b 100644
--- a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
@@ -184,53 +184,53 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r0 = and(#24,asl(r0,#3))
-; CHECK-NEXT: r3 = and(r0,#-4)
-; CHECK-NEXT: r4 = #255
-; CHECK-NEXT: r5 = and(r1,#255)
+; CHECK-NEXT: r2 = and(r0,#-4)
+; CHECK-NEXT: r3 = #255
+; CHECK-NEXT: r4 = and(r1,#255)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r2 = asl(r4,r0)
+; CHECK-NEXT: r5 = asl(r3,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r6 = sub(#-1,r2)
+; CHECK-NEXT: r5 = sub(#-1,r5)
; CHECK-NEXT: }
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB4_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: {
-; CHECK-NEXT: r7 = memw_locked(r3)
+; CHECK-NEXT: r7 = #255
+; CHECK-NEXT: r6 = memw_locked(r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r2 = lsr(r7,r0)
-; CHECK-NEXT: r7 = and(r7,r6)
+; CHECK-NEXT: r7 &= lsr(r6,r0)
+; CHECK-NEXT: r8 = and(r6,r5)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: p0 = bitsclr(r2,r4)
-; CHECK-NEXT: r8 = and(r2,#255)
+; CHECK-NEXT: p0 = cmp.gtu(r7,r4)
+; CHECK-NEXT: r7 = add(r7,#-1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: p1 = cmp.gtu(r8,r5)
-; CHECK-NEXT: if (p1.new) r8 = add(r1,#0)
-; CHECK-NEXT: if (!p1.new) r8 = add(r2,#-1)
+; CHECK-NEXT: p1 = !bitsset(r3,r7)
+; CHECK-NEXT: r9 = mux(p0,r1,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: if (p0) r8 = add(r1,#0)
+; CHECK-NEXT: if (p1) r9 = add(r1,#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r8 = and(r8,#255)
+; CHECK-NEXT: r7 = and(r9,#255)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r7 |= asl(r8,r0)
+; CHECK-NEXT: r8 |= asl(r7,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw_locked(r3,p0) = r7
+; CHECK-NEXT: memw_locked(r2,p0) = r8
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: if (!p0) jump:nt .LBB4_1
; CHECK-NEXT: }
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = r2
+; CHECK-NEXT: r0 = lsr(r6,r0)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: }
%result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst
@@ -243,53 +243,53 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r0 = and(#24,asl(r0,#3))
-; CHECK-NEXT: r3 = and(r0,#-4)
-; CHECK-NEXT: r4 = ##65535
+; CHECK-NEXT: r2 = and(r0,#-4)
+; CHECK-NEXT: r3 = ##65535
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r2 = asl(r4,r0)
-; CHECK-NEXT: r5 = zxth(r1)
+; CHECK-NEXT: r5 = asl(r3,r0)
+; CHECK-NEXT: r4 = zxth(r1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r6 = sub(#-1,r2)
+; CHECK-NEXT: r5 = sub(#-1,r5)
; CHECK-NEXT: }
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB5_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: {
-; CHECK-NEXT: r7 = memw_locked(r3)
+; CHECK-NEXT: r7 = ##65535
+; CHECK-NEXT: r6 = memw_locked(r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r2 = lsr(r7,r0)
-; CHECK-NEXT: r7 = and(r7,r6)
+; CHECK-NEXT: r7 &= lsr(r6,r0)
+; CHECK-NEXT: r8 = and(r6,r5)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: p0 = bitsclr(r2,r4)
-; CHECK-NEXT: r8 = zxth(r2)
+; CHECK-NEXT: p0 = cmp.gtu(r7,r4)
+; CHECK-NEXT: r7 = add(r7,#-1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: p1 = cmp.gtu(r8,r5)
-; CHECK-NEXT: if (p1.new) r8 = add(r1,#0)
-; CHECK-NEXT: if (!p1.new) r8 = add(r2,#-1)
+; CHECK-NEXT: p1 = !bitsset(r3,r7)
+; CHECK-NEXT: r9 = mux(p0,r1,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: if (p0) r8 = add(r1,#0)
+; CHECK-NEXT: if (p1) r9 = add(r1,#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r8 = zxth(r8)
+; CHECK-NEXT: r7 = zxth(r9)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r7 |= asl(r8,r0)
+; CHECK-NEXT: r8 |= asl(r7,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw_locked(r3,p0) = r7
+; CHECK-NEXT: memw_locked(r2,p0) = r8
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: if (!p0) jump:nt .LBB5_1
; CHECK-NEXT: }
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = r2
+; CHECK-NEXT: r0 = lsr(r6,r0)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: }
%result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst
@@ -308,15 +308,17 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: p0 = cmp.gtu(r2,r1)
-; CHECK-NEXT: p1 = cmp.eq(r2,#0)
-; CHECK-NEXT: if (p0.new) r3 = add(r1,#0)
-; CHECK-NEXT: if (!p0.new) r3 = add(r2,#-1)
+; CHECK-NEXT: r3 = add(r2,#-1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: if (p1) r3 = add(r1,#0)
+; CHECK-NEXT: p1 = cmp.eq(r3,#-1)
+; CHECK-NEXT: r4 = mux(p0,r1,r3)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memw_locked(r0,p0) = r3
+; CHECK-NEXT: if (p1) r4 = add(r1,#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: memw_locked(r0,p0) = r4
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: if (!p0) jump:nt .LBB6_1
@@ -336,7 +338,6 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r7:6 = combine(#-1,#-1)
-; CHECK-NEXT: r9:8 = combine(#0,#0)
; CHECK-NEXT: }
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_1: // %atomicrmw.start
@@ -345,22 +346,20 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
; CHECK-NEXT: r5:4 = memd_locked(r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: p1 = cmp.gtu(r5:4,r3:2)
-; CHECK-NEXT: p0 = cmp.eq(r5:4,r9:8)
-; CHECK-NEXT: }
-; CHECK-NEXT: {
-; CHECK-NEXT: r13:12 = add(r5:4,r7:6)
+; CHECK-NEXT: p0 = cmp.gtu(r5:4,r3:2)
+; CHECK-NEXT: r9:8 = add(r5:4,r7:6)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r1 = mux(p1,r2,r12)
-; CHECK-NEXT: r14 = mux(p1,r3,r13)
+; CHECK-NEXT: p1 = cmp.eq(r9:8,r7:6)
+; CHECK-NEXT: r1 = mux(p0,r2,r8)
+; CHECK-NEXT: r12 = mux(p0,r3,r9)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r10 = mux(p0,r2,r1)
-; CHECK-NEXT: r11 = mux(p0,r3,r14)
+; CHECK-NEXT: r14 = mux(p1,r2,r1)
+; CHECK-NEXT: r15 = mux(p1,r3,r12)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: memd_locked(r0,p0) = r11:10
+; CHECK-NEXT: memd_locked(r0,p0) = r15:14
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: if (!p0) jump:nt .LBB7_1
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 148886224454c7..db29a828a2c137 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -48,36 +48,33 @@ define i64 @uaddo1_overflow_used(i64 %a, i64 %b) nounwind ssp {
define i64 @uaddo1_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
; RV32-LABEL: uaddo1_math_overflow_used:
; RV32: # %bb.0:
-; RV32-NEXT: add a6, a3, a1
-; RV32-NEXT: add a5, a2, a0
-; RV32-NEXT: sltu a7, a5, a2
-; RV32-NEXT: add a6, a6, a7
-; RV32-NEXT: beq a6, a1, .LBB1_2
+; RV32-NEXT: add a5, a3, a1
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: sltu a1, a0, a2
+; RV32-NEXT: add a5, a5, a1
+; RV32-NEXT: beq a5, a3, .LBB1_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: sltu a0, a6, a1
-; RV32-NEXT: beqz a0, .LBB1_3
-; RV32-NEXT: j .LBB1_4
+; RV32-NEXT: sltu a1, a5, a3
; RV32-NEXT: .LBB1_2:
-; RV32-NEXT: sltu a0, a5, a0
-; RV32-NEXT: bnez a0, .LBB1_4
-; RV32-NEXT: .LBB1_3:
+; RV32-NEXT: bnez a1, .LBB1_4
+; RV32-NEXT: # %bb.3:
; RV32-NEXT: li a2, 42
; RV32-NEXT: .LBB1_4:
-; RV32-NEXT: neg a1, a0
+; RV32-NEXT: neg a1, a1
; RV32-NEXT: and a1, a1, a3
-; RV32-NEXT: sw a5, 0(a4)
-; RV32-NEXT: sw a6, 4(a4)
+; RV32-NEXT: sw a0, 0(a4)
+; RV32-NEXT: sw a5, 4(a4)
; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: uaddo1_math_overflow_used:
; RV64: # %bb.0:
-; RV64-NEXT: add a3, a1, a0
-; RV64-NEXT: bltu a3, a0, .LBB1_2
+; RV64-NEXT: add a0, a1, a0
+; RV64-NEXT: bltu a0, a1, .LBB1_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a1, 42
; RV64-NEXT: .LBB1_2:
-; RV64-NEXT: sd a3, 0(a2)
+; RV64-NEXT: sd a0, 0(a2)
; RV64-NEXT: mv a0, a1
; RV64-NEXT: ret
%add = add i64 %b, %a
@@ -203,7 +200,7 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
; RV32-NEXT: add a0, a2, a0
; RV32-NEXT: sltu a1, a0, a2
; RV32-NEXT: add a5, a5, a1
-; RV32-NEXT: beq a3, a5, .LBB5_2
+; RV32-NEXT: beq a5, a3, .LBB5_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: sltu a1, a5, a3
; RV32-NEXT: .LBB5_2:
@@ -620,10 +617,9 @@ define i1 @uaddo_i64_increment_alt(i64 %x, ptr %p) {
; RV32-LABEL: uaddo_i64_increment_alt:
; RV32: # %bb.0:
; RV32-NEXT: addi a3, a0, 1
-; RV32-NEXT: and a0, a0, a1
-; RV32-NEXT: seqz a4, a3
-; RV32-NEXT: addi a0, a0, 1
-; RV32-NEXT: add a1, a1, a4
+; RV32-NEXT: seqz a0, a3
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: or a0, a3, a1
; RV32-NEXT: seqz a0, a0
; RV32-NEXT: sw a3, 0(a2)
; RV32-NEXT: sw a1, 4(a2)
@@ -646,13 +642,12 @@ define i1 @uaddo_i64_increment_alt(i64 %x, ptr %p) {
define i1 @uaddo_i64_increment_alt_dom(i64 %x, ptr %p) {
; RV32-LABEL: uaddo_i64_increment_alt_dom:
; RV32: # %bb.0:
-; RV32-NEXT: and a3, a0, a1
-; RV32-NEXT: addi a4, a0, 1
-; RV32-NEXT: addi a3, a3, 1
-; RV32-NEXT: seqz a5, a4
+; RV32-NEXT: addi a3, a0, 1
; RV32-NEXT: seqz a0, a3
-; RV32-NEXT: add a1, a1, a5
-; RV32-NEXT: sw a4, 0(a2)
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: or a0, a3, a1
+; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: sw a3, 0(a2)
; RV32-NEXT: sw a1, 4(a2)
; RV32-NEXT: ret
;
@@ -673,16 +668,18 @@ define i1 @uaddo_i64_increment_alt_dom(i64 %x, ptr %p) {
define i1 @uaddo_i32_decrement_alt(i32 signext %x, ptr %p) {
; RV32-LABEL: uaddo_i32_decrement_alt:
; RV32: # %bb.0:
-; RV32-NEXT: addi a2, a0, -1
-; RV32-NEXT: snez a0, a0
-; RV32-NEXT: sw a2, 0(a1)
+; RV32-NEXT: snez a2, a0
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: sw a0, 0(a1)
+; RV32-NEXT: mv a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: uaddo_i32_decrement_alt:
; RV64: # %bb.0:
-; RV64-NEXT: addi a2, a0, -1
-; RV64-NEXT: snez a0, a0
-; RV64-NEXT: sw a2, 0(a1)
+; RV64-NEXT: snez a2, a0
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: sw a0, 0(a1)
+; RV64-NEXT: mv a0, a2
; RV64-NEXT: ret
%a = add i32 %x, -1
store i32 %a, ptr %p
@@ -693,20 +690,21 @@ define i1 @uaddo_i32_decrement_alt(i32 signext %x, ptr %p) {
define i1 @uaddo_i64_decrement_alt(i64 %x, ptr %p) {
; RV32-LABEL: uaddo_i64_decrement_alt:
; RV32: # %bb.0:
-; RV32-NEXT: seqz a3, a0
-; RV32-NEXT: addi a4, a0, -1
-; RV32-NEXT: or a0, a0, a1
-; RV32-NEXT: sub a1, a1, a3
-; RV32-NEXT: snez a0, a0
-; RV32-NEXT: sw a4, 0(a2)
+; RV32-NEXT: or a3, a0, a1
+; RV32-NEXT: seqz a4, a0
+; RV32-NEXT: addi a5, a0, -1
+; RV32-NEXT: snez a0, a3
+; RV32-NEXT: sub a1, a1, a4
+; RV32-NEXT: sw a5, 0(a2)
; RV32-NEXT: sw a1, 4(a2)
; RV32-NEXT: ret
;
; RV64-LABEL: uaddo_i64_decrement_alt:
; RV64: # %bb.0:
-; RV64-NEXT: addi a2, a0, -1
-; RV64-NEXT: snez a0, a0
-; RV64-NEXT: sd a2, 0(a1)
+; RV64-NEXT: snez a2, a0
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: sd a0, 0(a1)
+; RV64-NEXT: mv a0, a2
; RV64-NEXT: ret
%a = add i64 %x, -1
store i64 %a, ptr %p
@@ -794,24 +792,26 @@ define i1 @usubo_ult_i64_overflow_used(i64 %x, i64 %y, ptr %p) {
define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) {
; RV32-LABEL: usubo_ult_i64_math_overflow_used:
; RV32: # %bb.0:
-; RV32-NEXT: mv a5, a0
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: sub a6, a1, a3
-; RV32-NEXT: sub a5, a5, a2
-; RV32-NEXT: sub a2, a6, a0
-; RV32-NEXT: sw a5, 0(a4)
-; RV32-NEXT: sw a2, 4(a4)
-; RV32-NEXT: beq a1, a3, .LBB23_2
+; RV32-NEXT: sltu a5, a0, a2
+; RV32-NEXT: sub a3, a1, a3
+; RV32-NEXT: sub a3, a3, a5
+; RV32-NEXT: sub a2, a0, a2
+; RV32-NEXT: beq a3, a1, .LBB23_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: sltu a0, a1, a3
+; RV32-NEXT: j .LBB23_3
; RV32-NEXT: .LBB23_2:
+; RV32-NEXT: sltu a0, a0, a2
+; RV32-NEXT: .LBB23_3:
+; RV32-NEXT: sw a2, 0(a4)
+; RV32-NEXT: sw a3, 4(a4)
; RV32-NEXT: ret
;
; RV64-LABEL: usubo_ult_i64_math_overflow_used:
; RV64: # %bb.0:
-; RV64-NEXT: sub a3, a0, a1
+; RV64-NEXT: sub a1, a0, a1
; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: sd a3, 0(a2)
+; RV64-NEXT: sd a1, 0(a2)
; RV64-NEXT: ret
%s = sub i64 %x, %y
store i64 %s, ptr %p
@@ -824,20 +824,17 @@ define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) {
define i1 @usubo_ugt_i32(i32 %x, i32 %y, ptr %p) {
; RV32-LABEL: usubo_ugt_i32:
; RV32: # %bb.0:
-; RV32-NEXT: sltu a3, a0, a1
-; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: sw a0, 0(a2)
-; RV32-NEXT: mv a0, a3
+; RV32-NEXT: sub a1, a0, a1
+; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: sw a1, 0(a2)
; RV32-NEXT: ret
;
; RV64-LABEL: usubo_ugt_i32:
; RV64: # %bb.0:
-; RV64-NEXT: sext.w a3, a1
-; RV64-NEXT: sext.w a4, a0
-; RV64-NEXT: sltu a3, a4, a3
-; RV64-NEXT: sub a0, a0, a1
-; RV64-NEXT: sw a0, 0(a2)
-; RV64-NEXT: mv a0, a3
+; RV64-NEXT: subw a1, a0, a1
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: sw a1, 0(a2)
; RV64-NEXT: ret
%ov = icmp ugt i32 %y, %x
%s = sub i32 %x, %y
@@ -959,16 +956,16 @@ define i1 @usubo_eq_constant1_op1_i32(i32 %x, ptr %p) {
; RV32-LABEL: usubo_eq_constant1_op1_i32:
; RV32: # %bb.0:
; RV32-NEXT: addi a2, a0, -1
-; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sw a2, 0(a1)
; RV32-NEXT: ret
;
; RV64-LABEL: usubo_eq_constant1_op1_i32:
; RV64: # %bb.0:
-; RV64-NEXT: sext.w a2, a0
-; RV64-NEXT: addi a3, a0, -1
-; RV64-NEXT: seqz a0, a2
-; RV64-NEXT: sw a3, 0(a1)
+; RV64-NEXT: addiw a2, a0, -1
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: sltu a0, a0, a2
+; RV64-NEXT: sw a2, 0(a1)
; RV64-NEXT: ret
%s = add i32 %x, -1
%ov = icmp eq i32 %x, 0
@@ -982,16 +979,15 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, ptr %p) {
; RV32-LABEL: usubo_ne_constant0_op1_i32:
; RV32: # %bb.0:
; RV32-NEXT: neg a2, a0
-; RV32-NEXT: snez a0, a0
+; RV32-NEXT: snez a0, a2
; RV32-NEXT: sw a2, 0(a1)
; RV32-NEXT: ret
;
; RV64-LABEL: usubo_ne_constant0_op1_i32:
; RV64: # %bb.0:
-; RV64-NEXT: sext.w a2, a0
-; RV64-NEXT: neg a3, a0
+; RV64-NEXT: negw a2, a0
; RV64-NEXT: snez a0, a2
-; RV64-NEXT: sw a3, 0(a1)
+; RV64-NEXT: sw a2, 0(a1)
; RV64-NEXT: ret
%s = sub i32 0, %x
%ov = icmp ne i32 %x, 0
@@ -1078,41 +1074,43 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
; RV32-NEXT: .cfi_offset s4, -24
; RV32-NEXT: .cfi_offset s5, -28
; RV32-NEXT: .cfi_offset s6, -32
-; RV32-NEXT: mv s5, a5
-; RV32-NEXT: mv s3, a1
+; RV32-NEXT: mv s2, a5
+; RV32-NEXT: mv s1, a1
; RV32-NEXT: andi a1, a5, 1
-; RV32-NEXT: beqz a1, .LBB32_8
+; RV32-NEXT: beqz a1, .LBB32_7
; RV32-NEXT: # %bb.1: # %t
; RV32-NEXT: mv s0, a4
-; RV32-NEXT: mv s2, a3
-; RV32-NEXT: mv s1, a2
+; RV32-NEXT: mv s5, a3
+; RV32-NEXT: mv s3, a2
; RV32-NEXT: mv s4, a0
-; RV32-NEXT: beq s3, a3, .LBB32_3
+; RV32-NEXT: beq s1, a3, .LBB32_3
; RV32-NEXT: # %bb.2: # %t
-; RV32-NEXT: sltu s6, s3, s2
+; RV32-NEXT: sltu s6, s1, s5
; RV32-NEXT: j .LBB32_4
; RV32-NEXT: .LBB32_3:
-; RV32-NEXT: sltu s6, s4, s1
+; RV32-NEXT: sltu s6, s4, s3
; RV32-NEXT: .LBB32_4: # %t
; RV32-NEXT: mv a0, s6
; RV32-NEXT: call call
-; RV32-NEXT: beqz s6, .LBB32_8
+; RV32-NEXT: beqz s6, .LBB32_7
; RV32-NEXT: # %bb.5: # %end
-; RV32-NEXT: sltu a1, s4, s1
-; RV32-NEXT: mv a0, a1
-; RV32-NEXT: beq s3, s2, .LBB32_7
+; RV32-NEXT: sltu a0, s4, s3
+; RV32-NEXT: sub a1, s1, s5
+; RV32-NEXT: sub a1, a1, a0
+; RV32-NEXT: sub a2, s4, s3
+; RV32-NEXT: beq a1, s1, .LBB32_8
; RV32-NEXT: # %bb.6: # %end
-; RV32-NEXT: sltu a0, s3, s2
-; RV32-NEXT: .LBB32_7: # %end
-; RV32-NEXT: sub a2, s3, s2
-; RV32-NEXT: sub a3, s4, s1
-; RV32-NEXT: sub a2, a2, a1
-; RV32-NEXT: sw a3, 0(s0)
-; RV32-NEXT: sw a2, 4(s0)
+; RV32-NEXT: sltu a0, s1, a1
; RV32-NEXT: j .LBB32_9
-; RV32-NEXT: .LBB32_8: # %f
-; RV32-NEXT: mv a0, s5
-; RV32-NEXT: .LBB32_9: # %f
+; RV32-NEXT: .LBB32_7: # %f
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: j .LBB32_10
+; RV32-NEXT: .LBB32_8:
+; RV32-NEXT: sltu a0, s4, a2
+; RV32-NEXT: .LBB32_9: # %end
+; RV32-NEXT: sw a2, 0(s0)
+; RV32-NEXT: sw a1, 4(s0)
+; RV32-NEXT: .LBB32_10: # %f
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -1142,44 +1140,39 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
; RV64-NEXT: sd s1, 24(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: .cfi_offset s1, -24
; RV64-NEXT: .cfi_offset s2, -32
; RV64-NEXT: .cfi_offset s3, -40
-; RV64-NEXT: .cfi_offset s4, -48
-; RV64-NEXT: mv s0, a3
+; RV64-NEXT: mv s1, a3
; RV64-NEXT: mv s2, a1
; RV64-NEXT: andi a1, a3, 1
; RV64-NEXT: beqz a1, .LBB32_3
; RV64-NEXT: # %bb.1: # %t
-; RV64-NEXT: mv s1, a2
+; RV64-NEXT: mv s0, a2
; RV64-NEXT: mv s3, a0
-; RV64-NEXT: sltu s4, a0, s2
-; RV64-NEXT: mv a0, s4
+; RV64-NEXT: sltu a0, a0, s2
; RV64-NEXT: call call
; RV64-NEXT: bgeu s3, s2, .LBB32_3
; RV64-NEXT: # %bb.2: # %end
-; RV64-NEXT: sub a0, s3, s2
-; RV64-NEXT: sd a0, 0(s1)
-; RV64-NEXT: mv a0, s4
+; RV64-NEXT: sub a1, s3, s2
+; RV64-NEXT: sltu a0, s3, a1
+; RV64-NEXT: sd a1, 0(s0)
; RV64-NEXT: j .LBB32_4
; RV64-NEXT: .LBB32_3: # %f
-; RV64-NEXT: mv a0, s0
+; RV64-NEXT: mv a0, s1
; RV64-NEXT: .LBB32_4: # %f
; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s2, 16(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s3, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s4, 0(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: .cfi_restore s2
; RV64-NEXT: .cfi_restore s3
-; RV64-NEXT: .cfi_restore s4
; RV64-NEXT: addi sp, sp, 48
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll
index 7309f1902bff63..b7c34070f1af6a 100644
--- a/llvm/test/CodeGen/X86/abdu-neg.ll
+++ b/llvm/test/CodeGen/X86/abdu-neg.ll
@@ -804,24 +804,21 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind {
define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind {
; X86-LABEL: abd_cmp_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: subl %eax, %edi
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: sbbl %edx, %ebx
-; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: sbbl %esi, %edx
-; X86-NEXT: cmovael %edi, %eax
-; X86-NEXT: cmovael %ebx, %edx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: xorl %esi, %ecx
+; X86-NEXT: xorl %esi, %eax
+; X86-NEXT: subl %esi, %eax
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: negl %eax
+; X86-NEXT: sbbl %ecx, %edx
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: abd_cmp_i64:
@@ -848,34 +845,36 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 24(%ebp), %ecx
-; X86-NEXT: movl 28(%ebp), %edx
-; X86-NEXT: movl 40(%ebp), %eax
-; X86-NEXT: movl 44(%ebp), %esi
-; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: sbbl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 32(%ebp), %esi
-; X86-NEXT: movl 48(%ebp), %edi
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: movl 36(%ebp), %ebx
-; X86-NEXT: movl 52(%ebp), %eax
-; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: subl 40(%ebp), %ecx
-; X86-NEXT: sbbl 44(%ebp), %edx
+; X86-NEXT: sbbl 44(%ebp), %edi
; X86-NEXT: sbbl 48(%ebp), %esi
-; X86-NEXT: sbbl 52(%ebp), %ebx
-; X86-NEXT: cmovael {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: cmovael {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: cmovael %edi, %esi
-; X86-NEXT: cmovael %eax, %ebx
+; X86-NEXT: sbbl 52(%ebp), %eax
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: xorl %ebx, %eax
+; X86-NEXT: xorl %ebx, %esi
+; X86-NEXT: xorl %ebx, %edi
+; X86-NEXT: xorl %ebx, %ecx
+; X86-NEXT: subl %ebx, %ecx
+; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: negl %ecx
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %edi, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: sbbl %eax, %edx
; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %ebx, 12(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl %edx, 4(%eax)
; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %edx, 12(%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -885,15 +884,19 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
;
; X64-LABEL: abd_cmp_i128:
; X64: # %bb.0:
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: subq %rdi, %rax
-; X64-NEXT: movq %rcx, %r8
-; X64-NEXT: sbbq %rsi, %r8
-; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: subq %rdx, %rax
; X64-NEXT: sbbq %rcx, %rsi
-; X64-NEXT: cmovbq %rdi, %rax
-; X64-NEXT: cmovbq %rsi, %r8
-; X64-NEXT: movq %r8, %rdx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %rcx, %rcx
+; X64-NEXT: xorq %rcx, %rsi
+; X64-NEXT: xorq %rcx, %rax
+; X64-NEXT: subq %rcx, %rax
+; X64-NEXT: sbbq %rcx, %rsi
+; X64-NEXT: negq %rax
+; X64-NEXT: sbbq %rsi, %rdi
+; X64-NEXT: movq %rdi, %rdx
; X64-NEXT: retq
%cmp = icmp ult i128 %a, %b
%ab = sub i128 %a, %b
diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index 1b307b30d8c0d1..4e31b48ec5cece 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -2065,11 +2065,10 @@ define i64 @PR51612(i64 %x, i64 %y) {
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %eax
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %edx
-; ATHLON-NEXT: addl $1, %ecx
-; ATHLON-NEXT: adcl $0, %edx
-; ATHLON-NEXT: incl %eax
-; ATHLON-NEXT: orl %ecx, %edx
-; ATHLON-NEXT: cmovnel %ecx, %eax
+; ATHLON-NEXT: incl %edx
+; ATHLON-NEXT: addl $1, %eax
+; ATHLON-NEXT: adcl $0, %ecx
+; ATHLON-NEXT: cmovbl %edx, %eax
; ATHLON-NEXT: andl 10, %eax
; ATHLON-NEXT: xorl %edx, %edx
; ATHLON-NEXT: retl
@@ -2078,8 +2077,7 @@ define i64 @PR51612(i64 %x, i64 %y) {
; MCU: # %bb.0:
; MCU-NEXT: addl $1, %eax
; MCU-NEXT: adcl $0, %edx
-; MCU-NEXT: orl %eax, %edx
-; MCU-NEXT: jne .LBB45_2
+; MCU-NEXT: jae .LBB45_2
; MCU-NEXT: # %bb.1:
; MCU-NEXT: movl {{[0-9]+}}(%esp), %eax
; MCU-NEXT: incl %eax
>From cc2b304a29c115ad5460f52fbdd811ee60d352ec Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Thu, 25 Sep 2025 14:07:01 -0400
Subject: [PATCH 15/17] d
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 9 +
llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 663 ++++++++++----------
2 files changed, 348 insertions(+), 324 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index bdaf48652d107f..7e03c4f8af37d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -295,6 +295,15 @@ class AMDGPUTargetLowering : public TargetLowering {
bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
return true;
}
+
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool MathUsed) const override {
+ if (isOperationLegal(Opcode, VT))
+ return true;
+
+ return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT));
+ }
+
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
int &RefinementSteps, bool &UseOneConstNR,
bool Reciprocal) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 5134159e3e406a..34fe1255f9c3b2 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -97,24 +97,24 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[18:19], v[20:21], v18
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
-; SDAG-NEXT: v_or_b32_e32 v9, v30, v32
-; SDAG-NEXT: v_sub_i32_e32 v11, vcc, 0x7f, v8
-; SDAG-NEXT: v_or_b32_e32 v10, v31, v33
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v11
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 64, v11
-; SDAG-NEXT: v_lshl_b64 v[34:35], v[20:21], v11
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v8
-; SDAG-NEXT: v_or_b32_e32 v9, v23, v9
-; SDAG-NEXT: v_or_b32_e32 v8, v22, v8
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v11
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v19, v9, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v35, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v34, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5]
+; SDAG-NEXT: v_sub_i32_e32 v36, vcc, 0x7f, v8
+; SDAG-NEXT: v_or_b32_e32 v8, v30, v32
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[16:17], v36
+; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 64, v36
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[20:21], v36
+; SDAG-NEXT: v_or_b32_e32 v9, v31, v33
+; SDAG-NEXT: v_lshr_b64 v[34:35], v[20:21], v34
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_or_b32_e32 v8, v11, v35
+; SDAG-NEXT: v_or_b32_e32 v9, v10, v34
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v36
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v19, v8, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v18, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v22, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v36
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v8, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v10, v16, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v10, 0
; SDAG-NEXT: v_mov_b32_e32 v11, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -189,9 +189,9 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v16, v2
; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SDAG-NEXT: s_cbranch_execnz .LBB0_3
-; SDAG-NEXT: ; %bb.4: ; %Flow13
+; SDAG-NEXT: ; %bb.4: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; SDAG-NEXT: .LBB0_5: ; %Flow14
+; SDAG-NEXT: .LBB0_5: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v19
@@ -201,7 +201,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v19, v3, v9
; SDAG-NEXT: v_or_b32_e32 v22, v10, v0
; SDAG-NEXT: v_or_b32_e32 v23, v2, v8
-; SDAG-NEXT: .LBB0_6: ; %Flow16
+; SDAG-NEXT: .LBB0_6: ; %Flow15
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v7
; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15
@@ -294,24 +294,24 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
-; SDAG-NEXT: v_or_b32_e32 v10, v30, v32
-; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v6
-; SDAG-NEXT: v_or_b32_e32 v11, v31, v33
-; SDAG-NEXT: v_lshl_b64 v[6:7], v[8:9], v34
-; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34
-; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35
-; SDAG-NEXT: v_or_b32_e32 v7, v7, v11
-; SDAG-NEXT: v_or_b32_e32 v6, v6, v10
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[4:5]
+; SDAG-NEXT: v_sub_i32_e32 v36, vcc, 0x7f, v6
+; SDAG-NEXT: v_or_b32_e32 v6, v30, v32
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[8:9], v36
+; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 64, v36
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v36
+; SDAG-NEXT: v_or_b32_e32 v7, v31, v33
+; SDAG-NEXT: v_lshr_b64 v[34:35], v[2:3], v34
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT: v_or_b32_e32 v6, v11, v35
+; SDAG-NEXT: v_or_b32_e32 v7, v10, v34
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v36
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v13, v6, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v7, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v36
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v6, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v8, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: v_mov_b32_e32 v13, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -388,7 +388,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_cbranch_execnz .LBB0_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB0_11: ; %Flow11
+; SDAG-NEXT: .LBB0_11: ; %Flow9
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v11
@@ -398,7 +398,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v14, v5, v3
; SDAG-NEXT: v_or_b32_e32 v5, v12, v0
; SDAG-NEXT: v_or_b32_e32 v4, v4, v2
-; SDAG-NEXT: .LBB0_12: ; %Flow12
+; SDAG-NEXT: .LBB0_12: ; %Flow10
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26
; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24
@@ -509,32 +509,34 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_addc_u32_e64 v29, s[4:5], 0, v3, vcc
; GISEL-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v2
; GISEL-NEXT: v_not_b32_e32 v2, 63
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v0, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v32, v2
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 64, v32
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v32, v2
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 64, v32
; GISEL-NEXT: v_lshl_b64 v[0:1], v[18:19], v32
; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], v32
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: v_lshr_b64 v[8:9], v[18:19], v8
; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v16
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v32
; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc
-; GISEL-NEXT: v_or_b32_e32 v0, v8, v2
-; GISEL-NEXT: v_or_b32_e32 v1, v9, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v28, v30
+; GISEL-NEXT: v_or_b32_e32 v1, v29, v31
+; GISEL-NEXT: v_or_b32_e32 v2, v8, v2
+; GISEL-NEXT: v_or_b32_e32 v3, v9, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v23, v3, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v20, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v21, vcc
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v2, v20, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v3, v21, vcc
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_mov_b32_e32 v0, s8
; GISEL-NEXT: v_mov_b32_e32 v1, s9
; GISEL-NEXT: v_mov_b32_e32 v2, s10
; GISEL-NEXT: v_mov_b32_e32 v3, s11
-; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
-; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9]
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB0_5
; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v28
@@ -546,7 +548,6 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], v22
; GISEL-NEXT: v_or_b32_e32 v22, v2, v22
; GISEL-NEXT: v_or_b32_e32 v23, v3, v23
-; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_lshr_b64 v[2:3], v[20:21], v32
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v22, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v23, vcc
@@ -604,9 +605,9 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v1, v23
; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GISEL-NEXT: s_cbranch_execnz .LBB0_3
-; GISEL-NEXT: ; %bb.4: ; %Flow13
+; GISEL-NEXT: ; %bb.4: ; %Flow11
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
-; GISEL-NEXT: .LBB0_5: ; %Flow14
+; GISEL-NEXT: .LBB0_5: ; %Flow13
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_lshl_b64 v[2:3], v[16:17], 1
; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
@@ -614,48 +615,48 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v8, v8, v10
; GISEL-NEXT: v_or_b32_e32 v22, v0, v2
; GISEL-NEXT: v_or_b32_e32 v23, v1, v3
-; GISEL-NEXT: .LBB0_6: ; %Flow16
+; GISEL-NEXT: .LBB0_6: ; %Flow15
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7
; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15
-; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f
-; GISEL-NEXT: v_mov_b32_e32 v11, 0
+; GISEL-NEXT: v_mov_b32_e32 v16, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
; GISEL-NEXT: v_xor_b32_e32 v0, v18, v4
; GISEL-NEXT: v_xor_b32_e32 v1, v18, v5
; GISEL-NEXT: v_xor_b32_e32 v2, v18, v6
; GISEL-NEXT: v_xor_b32_e32 v3, v18, v7
; GISEL-NEXT: v_xor_b32_e32 v4, v19, v12
; GISEL-NEXT: v_xor_b32_e32 v5, v19, v13
-; GISEL-NEXT: v_xor_b32_e32 v14, v19, v14
-; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15
+; GISEL-NEXT: v_xor_b32_e32 v12, v19, v14
+; GISEL-NEXT: v_xor_b32_e32 v13, v19, v15
; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18
; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc
; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], v4, v19
; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], v5, v19, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc
-; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc
-; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc
-; GISEL-NEXT: v_ffbh_u32_e32 v14, v21
-; GISEL-NEXT: v_ffbh_u32_e32 v15, v20
-; GISEL-NEXT: v_ffbh_u32_e32 v16, v7
-; GISEL-NEXT: v_ffbh_u32_e32 v17, v6
+; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v2, v18, vcc
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v3, v18, vcc
+; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v12, v19, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v13, v19, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v12, v21
+; GISEL-NEXT: v_ffbh_u32_e32 v13, v20
+; GISEL-NEXT: v_ffbh_u32_e32 v14, v7
+; GISEL-NEXT: v_ffbh_u32_e32 v15, v6
; GISEL-NEXT: v_or_b32_e32 v0, v20, v4
; GISEL-NEXT: v_or_b32_e32 v1, v21, v5
-; GISEL-NEXT: v_or_b32_e32 v2, v6, v12
-; GISEL-NEXT: v_or_b32_e32 v3, v7, v13
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15
+; GISEL-NEXT: v_or_b32_e32 v2, v6, v10
+; GISEL-NEXT: v_or_b32_e32 v3, v7, v11
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, 32, v13
; GISEL-NEXT: v_ffbh_u32_e32 v26, v5
; GISEL-NEXT: v_ffbh_u32_e32 v27, v4
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, 32, v17
-; GISEL-NEXT: v_ffbh_u32_e32 v28, v13
-; GISEL-NEXT: v_ffbh_u32_e32 v29, v12
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15
+; GISEL-NEXT: v_ffbh_u32_e32 v28, v11
+; GISEL-NEXT: v_ffbh_u32_e32 v29, v10
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
-; GISEL-NEXT: v_min_u32_e32 v0, v14, v15
+; GISEL-NEXT: v_min_u32_e32 v0, v12, v13
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v27
-; GISEL-NEXT: v_min_u32_e32 v2, v16, v17
+; GISEL-NEXT: v_min_u32_e32 v2, v14, v15
; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v29
; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0
; GISEL-NEXT: v_min_u32_e32 v1, v26, v1
@@ -665,32 +666,32 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[16:17]
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v2
+; GISEL-NEXT: v_xor_b32_e32 v12, 0x7f, v2
; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_or_b32_e32 v10, v10, v0
-; GISEL-NEXT: v_or_b32_e32 v11, v3, v1
+; GISEL-NEXT: v_or_b32_e32 v12, v12, v0
+; GISEL-NEXT: v_or_b32_e32 v13, v3, v1
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_or_b32_e32 v11, v14, v15
-; GISEL-NEXT: v_and_b32_e32 v14, 1, v11
-; GISEL-NEXT: v_or_b32_e32 v10, v11, v10
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v13, v14, v15
+; GISEL-NEXT: v_and_b32_e32 v14, 1, v13
+; GISEL-NEXT: v_or_b32_e32 v12, v13, v12
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc
-; GISEL-NEXT: v_and_b32_e32 v16, 1, v10
+; GISEL-NEXT: v_and_b32_e32 v16, 1, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -700,56 +701,58 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v3, vcc
; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v2
; GISEL-NEXT: v_not_b32_e32 v2, 63
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v0, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v30, v2
-; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v30
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v30, v2
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 64, v30
; GISEL-NEXT: v_lshl_b64 v[0:1], v[6:7], v30
-; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v30
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GISEL-NEXT: v_lshr_b64 v[10:11], v[6:7], v10
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], v30
+; GISEL-NEXT: v_lshr_b64 v[12:13], v[6:7], v12
; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v14
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30
; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc
-; GISEL-NEXT: v_or_b32_e32 v0, v10, v2
-; GISEL-NEXT: v_or_b32_e32 v1, v11, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v26, v28
+; GISEL-NEXT: v_or_b32_e32 v1, v27, v29
+; GISEL-NEXT: v_or_b32_e32 v2, v12, v2
+; GISEL-NEXT: v_or_b32_e32 v3, v13, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v2, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v3, v11, vcc
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_mov_b32_e32 v0, s8
; GISEL-NEXT: v_mov_b32_e32 v1, s9
; GISEL-NEXT: v_mov_b32_e32 v2, s10
; GISEL-NEXT: v_mov_b32_e32 v3, s11
-; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB0_11
; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26
; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v26
-; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26
+; GISEL-NEXT: v_lshr_b64 v[0:1], v[10:11], v26
; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26
; GISEL-NEXT: s_mov_b64 s[4:5], 0
; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v20
; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v21, vcc
-; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16
-; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[10:11], v16
+; GISEL-NEXT: v_lshr_b64 v[10:11], v[10:11], v32
; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc
; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v5, vcc
; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
; GISEL-NEXT: v_or_b32_e32 v2, v2, v16
; GISEL-NEXT: v_or_b32_e32 v3, v3, v17
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
-; GISEL-NEXT: v_cndmask_b32_e32 v12, v2, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v13, v3, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v3, v7, vcc
; GISEL-NEXT: v_mov_b32_e32 v7, 0
; GISEL-NEXT: v_mov_b32_e32 v0, s4
; GISEL-NEXT: v_mov_b32_e32 v1, s5
@@ -757,20 +760,20 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v3, s7
; GISEL-NEXT: .LBB0_9: ; %udiv-do-while
; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], 1
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], 1
; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
-; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v13
-; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v11
-; GISEL-NEXT: v_lshl_b64 v[12:13], v[14:15], 1
-; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v11
+; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v13
+; GISEL-NEXT: v_lshl_b64 v[10:11], v[14:15], 1
+; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v15
; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26
; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc
; GISEL-NEXT: v_or_b32_e32 v16, v16, v6
; GISEL-NEXT: v_or_b32_e32 v2, v2, v34
-; GISEL-NEXT: v_or_b32_e32 v10, v10, v14
-; GISEL-NEXT: v_or_b32_e32 v14, v0, v12
-; GISEL-NEXT: v_or_b32_e32 v15, v1, v13
+; GISEL-NEXT: v_or_b32_e32 v12, v12, v14
+; GISEL-NEXT: v_or_b32_e32 v14, v0, v10
+; GISEL-NEXT: v_or_b32_e32 v15, v1, v11
; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc
; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v2
@@ -783,29 +786,29 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6
; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_and_b32_e32 v6, 1, v0
-; GISEL-NEXT: v_and_b32_e32 v12, v0, v20
-; GISEL-NEXT: v_and_b32_e32 v13, v0, v21
+; GISEL-NEXT: v_and_b32_e32 v10, v0, v20
+; GISEL-NEXT: v_and_b32_e32 v11, v0, v21
; GISEL-NEXT: v_and_b32_e32 v34, v0, v4
; GISEL-NEXT: v_and_b32_e32 v35, v0, v5
; GISEL-NEXT: v_mov_b32_e32 v0, v6
; GISEL-NEXT: v_mov_b32_e32 v1, v7
-; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v12
-; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v13, vcc
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v2, v10
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v3, v11, vcc
; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v34, vcc
; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GISEL-NEXT: s_cbranch_execnz .LBB0_9
; GISEL-NEXT: ; %bb.10: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB0_11: ; %Flow11
+; GISEL-NEXT: .LBB0_11: ; %Flow9
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1
-; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v15
-; GISEL-NEXT: v_or_b32_e32 v10, v10, v4
+; GISEL-NEXT: v_or_b32_e32 v12, v12, v4
; GISEL-NEXT: v_or_b32_e32 v14, v0, v2
; GISEL-NEXT: v_or_b32_e32 v15, v1, v3
-; GISEL-NEXT: .LBB0_12: ; %Flow12
+; GISEL-NEXT: .LBB0_12: ; %Flow10
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_xor_b32_e32 v3, v25, v24
; GISEL-NEXT: v_xor_b32_e32 v7, v19, v18
@@ -815,8 +818,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3
; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7
; GISEL-NEXT: v_xor_b32_e32 v5, v15, v7
-; GISEL-NEXT: v_xor_b32_e32 v8, v10, v7
-; GISEL-NEXT: v_xor_b32_e32 v9, v11, v7
+; GISEL-NEXT: v_xor_b32_e32 v8, v12, v7
+; GISEL-NEXT: v_xor_b32_e32 v9, v13, v7
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v7
@@ -903,24 +906,24 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16
; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v23, vcc
; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v24, vcc
-; SDAG-NEXT: v_or_b32_e32 v22, v18, v28
-; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v21
-; SDAG-NEXT: v_or_b32_e32 v23, v27, v29
-; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v26
-; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 64, v26
-; SDAG-NEXT: v_lshl_b64 v[30:31], v[0:1], v26
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23]
-; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v21
-; SDAG-NEXT: v_or_b32_e32 v22, v25, v22
-; SDAG-NEXT: v_or_b32_e32 v21, v24, v21
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
-; SDAG-NEXT: v_cndmask_b32_e64 v22, v17, v22, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v16, v21, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v30, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
-; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v21, v2, s[4:5]
+; SDAG-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v21
+; SDAG-NEXT: v_or_b32_e32 v21, v18, v28
+; SDAG-NEXT: v_lshl_b64 v[23:24], v[2:3], v32
+; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 64, v32
+; SDAG-NEXT: v_lshl_b64 v[25:26], v[0:1], v32
+; SDAG-NEXT: v_or_b32_e32 v22, v27, v29
+; SDAG-NEXT: v_lshr_b64 v[30:31], v[0:1], v30
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[21:22]
+; SDAG-NEXT: v_or_b32_e32 v21, v24, v31
+; SDAG-NEXT: v_or_b32_e32 v22, v23, v30
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v17, v21, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v23, v16, v22, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v26, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v25, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v32
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v21, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v23, v2, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: v_mov_b32_e32 v24, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -995,9 +998,9 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v25, v19
; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SDAG-NEXT: s_cbranch_execnz .LBB1_3
-; SDAG-NEXT: ; %bb.4: ; %Flow13
+; SDAG-NEXT: ; %bb.4: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; SDAG-NEXT: .LBB1_5: ; %Flow14
+; SDAG-NEXT: .LBB1_5: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[0:1], v[21:22], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v17
@@ -1007,7 +1010,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v18, v20, v3
; SDAG-NEXT: v_or_b32_e32 v17, v23, v0
; SDAG-NEXT: v_or_b32_e32 v19, v19, v2
-; SDAG-NEXT: .LBB1_6: ; %Flow16
+; SDAG-NEXT: .LBB1_6: ; %Flow15
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_or_b32_e32 v1, v13, v15
; SDAG-NEXT: v_or_b32_e32 v0, v12, v14
@@ -1078,24 +1081,24 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[8:9], v[4:5], v8
; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v20, vcc
; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v21, vcc
-; SDAG-NEXT: v_or_b32_e32 v10, v22, v24
-; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v2
-; SDAG-NEXT: v_or_b32_e32 v11, v23, v25
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], v26
-; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v26
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v27
-; SDAG-NEXT: v_or_b32_e32 v3, v3, v11
-; SDAG-NEXT: v_or_b32_e32 v2, v2, v10
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v2
+; SDAG-NEXT: v_or_b32_e32 v2, v22, v24
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[6:7], v28
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v28
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v28
+; SDAG-NEXT: v_or_b32_e32 v3, v23, v25
+; SDAG-NEXT: v_lshr_b64 v[26:27], v[4:5], v26
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_or_b32_e32 v2, v11, v27
+; SDAG-NEXT: v_or_b32_e32 v3, v10, v26
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v8, v3, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v21, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v20, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v2, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v6, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -1172,7 +1175,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_cbranch_execnz .LBB1_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB1_11: ; %Flow11
+; SDAG-NEXT: .LBB1_11: ; %Flow9
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v9
@@ -1182,7 +1185,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v10, v1, v5
; SDAG-NEXT: v_or_b32_e32 v9, v20, v2
; SDAG-NEXT: v_or_b32_e32 v11, v0, v4
-; SDAG-NEXT: .LBB1_12: ; %Flow12
+; SDAG-NEXT: .LBB1_12: ; %Flow10
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v0, v19
; SDAG-NEXT: v_mov_b32_e32 v1, v18
@@ -1263,34 +1266,36 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v22
; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v23, vcc
-; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v22
+; GISEL-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v22
; GISEL-NEXT: v_not_b32_e32 v2, 63
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v20, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v21, vcc
-; GISEL-NEXT: v_add_i32_e64 v22, s[4:5], v30, v2
-; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v30
-; GISEL-NEXT: v_lshl_b64 v[2:3], v[0:1], v30
-; GISEL-NEXT: v_lshl_b64 v[18:19], v[16:17], v30
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_add_i32_e32 v22, vcc, v32, v2
+; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 64, v32
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[0:1], v32
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[16:17], v32
; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v20
; GISEL-NEXT: v_lshl_b64 v[24:25], v[0:1], v22
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v32
; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v2, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v3, vcc
+; GISEL-NEXT: v_or_b32_e32 v30, v26, v28
+; GISEL-NEXT: v_or_b32_e32 v31, v27, v29
; GISEL-NEXT: v_or_b32_e32 v2, v20, v18
; GISEL-NEXT: v_or_b32_e32 v3, v21, v19
; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[30:31]
; GISEL-NEXT: v_mov_b32_e32 v21, s11
; GISEL-NEXT: v_mov_b32_e32 v20, s10
; GISEL-NEXT: v_mov_b32_e32 v19, s9
; GISEL-NEXT: v_mov_b32_e32 v18, s8
-; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB1_5
; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26
@@ -1360,9 +1365,9 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v18, v0
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GISEL-NEXT: s_cbranch_execnz .LBB1_3
-; GISEL-NEXT: ; %bb.4: ; %Flow13
+; GISEL-NEXT: ; %bb.4: ; %Flow11
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB1_5: ; %Flow14
+; GISEL-NEXT: .LBB1_5: ; %Flow13
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[0:1], v[22:23], 1
; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
@@ -1370,7 +1375,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v2, v2, v8
; GISEL-NEXT: v_or_b32_e32 v18, v18, v0
; GISEL-NEXT: v_or_b32_e32 v19, v19, v1
-; GISEL-NEXT: .LBB1_6: ; %Flow16
+; GISEL-NEXT: .LBB1_6: ; %Flow15
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_or_b32_e32 v0, v12, v14
@@ -1436,34 +1441,36 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: ; %bb.7: ; %udiv-bb1
; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v16
; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v17, vcc
-; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16
+; GISEL-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v16
; GISEL-NEXT: v_not_b32_e32 v9, 63
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v0, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v26, v9
-; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v26
-; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v26
-; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v26
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v28, v9
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 64, v28
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v28
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v28
; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v10
; GISEL-NEXT: v_lshl_b64 v[22:23], v[4:5], v9
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28
; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v26, v8, v24
+; GISEL-NEXT: v_or_b32_e32 v27, v11, v25
; GISEL-NEXT: v_or_b32_e32 v0, v20, v16
; GISEL-NEXT: v_or_b32_e32 v1, v21, v17
; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[26:27]
; GISEL-NEXT: v_mov_b32_e32 v23, s11
; GISEL-NEXT: v_mov_b32_e32 v22, s10
; GISEL-NEXT: v_mov_b32_e32 v21, s9
; GISEL-NEXT: v_mov_b32_e32 v20, s8
-; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB1_11
; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v8
@@ -1535,7 +1542,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_cbranch_execnz .LBB1_9
; GISEL-NEXT: ; %bb.10: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB1_11: ; %Flow11
+; GISEL-NEXT: .LBB1_11: ; %Flow9
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], 1
; GISEL-NEXT: v_lshl_b64 v[8:9], v[0:1], 1
@@ -1543,7 +1550,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v8, v8, v0
; GISEL-NEXT: v_or_b32_e32 v10, v20, v4
; GISEL-NEXT: v_or_b32_e32 v11, v21, v5
-; GISEL-NEXT: .LBB1_12: ; %Flow12
+; GISEL-NEXT: .LBB1_12: ; %Flow10
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mov_b32_e32 v0, v18
; GISEL-NEXT: v_mov_b32_e32 v1, v19
@@ -1650,24 +1657,24 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20
; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc
-; SDAG-NEXT: v_or_b32_e32 v18, v32, v34
-; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10
-; SDAG-NEXT: v_or_b32_e32 v19, v33, v35
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24
-; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25
-; SDAG-NEXT: v_or_b32_e32 v11, v11, v19
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v18
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5]
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v10
+; SDAG-NEXT: v_or_b32_e32 v10, v32, v34
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[0:1], v26
+; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 64, v26
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v26
+; SDAG-NEXT: v_or_b32_e32 v11, v33, v35
+; SDAG-NEXT: v_lshr_b64 v[24:25], v[16:17], v24
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_or_b32_e32 v10, v19, v25
+; SDAG-NEXT: v_or_b32_e32 v11, v18, v24
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v21, v10, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v20, v11, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v10, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v18, v0, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v18, 0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -1742,9 +1749,9 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v22, v8
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB2_3
-; SDAG-NEXT: ; %bb.4: ; %Flow13
+; SDAG-NEXT: ; %bb.4: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB2_5: ; %Flow14
+; SDAG-NEXT: .LBB2_5: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21
@@ -1754,7 +1761,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v32, v18, v10
; SDAG-NEXT: v_or_b32_e32 v27, v9, v21
; SDAG-NEXT: v_or_b32_e32 v33, v8, v20
-; SDAG-NEXT: .LBB2_6: ; %Flow16
+; SDAG-NEXT: .LBB2_6: ; %Flow15
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7
; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
@@ -1845,22 +1852,22 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[18:19], v[8:9], v18
; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc
; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v15, vcc
-; SDAG-NEXT: v_or_b32_e32 v14, v38, v48
-; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 0x7f, v10
-; SDAG-NEXT: v_or_b32_e32 v15, v39, v49
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[4:5], v22
-; SDAG-NEXT: v_sub_i32_e32 v23, vcc, 64, v22
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[8:9], v22
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT: v_lshr_b64 v[14:15], v[8:9], v23
-; SDAG-NEXT: v_or_b32_e32 v11, v11, v15
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v14
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22
-; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v10, s[4:5]
+; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10
+; SDAG-NEXT: v_or_b32_e32 v10, v38, v48
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[4:5], v24
+; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 64, v24
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[8:9], v24
+; SDAG-NEXT: v_or_b32_e32 v11, v39, v49
+; SDAG-NEXT: v_lshr_b64 v[22:23], v[8:9], v22
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_or_b32_e32 v10, v15, v23
+; SDAG-NEXT: v_or_b32_e32 v11, v14, v22
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v10, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v11, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v21, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v20, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
; SDAG-NEXT: v_cndmask_b32_e64 v15, v14, v5, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v14, v18, v4, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v18, 0
@@ -1939,7 +1946,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_cbranch_execnz .LBB2_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB2_11: ; %Flow11
+; SDAG-NEXT: .LBB2_11: ; %Flow9
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v11
@@ -1949,7 +1956,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v13, v13, v11
; SDAG-NEXT: v_or_b32_e32 v18, v18, v14
; SDAG-NEXT: v_or_b32_e32 v12, v12, v10
-; SDAG-NEXT: .LBB2_12: ; %Flow12
+; SDAG-NEXT: .LBB2_12: ; %Flow10
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_mul_lo_u32 v14, v33, v3
; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0
@@ -2107,32 +2114,34 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v3, vcc
; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v2
; GISEL-NEXT: v_not_b32_e32 v2, 63
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v33, vcc, 0, v0, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v34, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v24, v2
-; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 64, v24
+; GISEL-NEXT: v_add_i32_e32 v20, vcc, v24, v2
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, 64, v24
; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v24
; GISEL-NEXT: v_lshl_b64 v[2:3], v[8:9], v24
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: v_lshr_b64 v[18:19], v[16:17], v18
; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc
-; GISEL-NEXT: v_or_b32_e32 v0, v18, v2
-; GISEL-NEXT: v_or_b32_e32 v1, v19, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v31, v33
+; GISEL-NEXT: v_or_b32_e32 v1, v32, v34
+; GISEL-NEXT: v_or_b32_e32 v2, v18, v2
+; GISEL-NEXT: v_or_b32_e32 v3, v19, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v23, v3, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GISEL-NEXT: v_cndmask_b32_e32 v18, v0, v8, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v19, v1, v9, vcc
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v18, v2, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, v3, v9, vcc
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_mov_b32_e32 v0, s8
; GISEL-NEXT: v_mov_b32_e32 v1, s9
; GISEL-NEXT: v_mov_b32_e32 v2, s10
; GISEL-NEXT: v_mov_b32_e32 v3, s11
-; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB2_5
; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
; GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v31
@@ -2202,9 +2211,9 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v1, v23
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GISEL-NEXT: s_cbranch_execnz .LBB2_3
-; GISEL-NEXT: ; %bb.4: ; %Flow13
+; GISEL-NEXT: ; %bb.4: ; %Flow11
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB2_5: ; %Flow14
+; GISEL-NEXT: .LBB2_5: ; %Flow13
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
@@ -2212,7 +2221,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v18, v18, v20
; GISEL-NEXT: v_or_b32_e32 v31, v0, v2
; GISEL-NEXT: v_or_b32_e32 v32, v1, v3
-; GISEL-NEXT: .LBB2_6: ; %Flow16
+; GISEL-NEXT: .LBB2_6: ; %Flow15
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_ashrrev_i32_e32 v33, 31, v7
@@ -2298,32 +2307,34 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v15, vcc
; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v14
; GISEL-NEXT: v_not_b32_e32 v2, 63
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v38, vcc, 0, v0, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v39, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v24, v2
-; GISEL-NEXT: v_sub_i32_e64 v14, s[4:5], 64, v24
+; GISEL-NEXT: v_add_i32_e32 v20, vcc, v24, v2
+; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 64, v24
; GISEL-NEXT: v_lshl_b64 v[0:1], v[12:13], v24
; GISEL-NEXT: v_lshl_b64 v[2:3], v[6:7], v24
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: v_lshr_b64 v[14:15], v[12:13], v14
; GISEL-NEXT: v_lshl_b64 v[22:23], v[12:13], v20
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc
-; GISEL-NEXT: v_or_b32_e32 v0, v14, v2
-; GISEL-NEXT: v_or_b32_e32 v1, v15, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v36, v38
+; GISEL-NEXT: v_or_b32_e32 v1, v37, v39
+; GISEL-NEXT: v_or_b32_e32 v2, v14, v2
+; GISEL-NEXT: v_or_b32_e32 v3, v15, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v23, v3, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GISEL-NEXT: v_cndmask_b32_e32 v14, v0, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v15, v1, v7, vcc
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v2, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v15, v3, v7, vcc
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_mov_b32_e32 v0, s8
; GISEL-NEXT: v_mov_b32_e32 v1, s9
; GISEL-NEXT: v_mov_b32_e32 v2, s10
; GISEL-NEXT: v_mov_b32_e32 v3, s11
-; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB2_11
; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
; GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v36
@@ -2395,7 +2406,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_cbranch_execnz .LBB2_9
; GISEL-NEXT: ; %bb.10: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB2_11: ; %Flow11
+; GISEL-NEXT: .LBB2_11: ; %Flow9
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], 1
; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1
@@ -2403,7 +2414,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v2, v2, v14
; GISEL-NEXT: v_or_b32_e32 v20, v0, v22
; GISEL-NEXT: v_or_b32_e32 v21, v1, v23
-; GISEL-NEXT: .LBB2_12: ; %Flow12
+; GISEL-NEXT: .LBB2_12: ; %Flow10
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0
@@ -2533,24 +2544,24 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[22:23], v[0:1], v22
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v20, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v21, vcc
-; SDAG-NEXT: v_or_b32_e32 v19, v30, v32
-; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0x7f, v18
-; SDAG-NEXT: v_or_b32_e32 v20, v31, v33
-; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v21
-; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v21
-; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v21
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20]
-; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v18
-; SDAG-NEXT: v_or_b32_e32 v19, v25, v19
-; SDAG-NEXT: v_or_b32_e32 v18, v24, v18
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v21
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, v19, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v18, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v27, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v26, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v21
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5]
+; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v18
+; SDAG-NEXT: v_or_b32_e32 v18, v30, v32
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v28
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v28
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v28
+; SDAG-NEXT: v_or_b32_e32 v19, v31, v33
+; SDAG-NEXT: v_lshr_b64 v[26:27], v[0:1], v26
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_or_b32_e32 v18, v21, v27
+; SDAG-NEXT: v_or_b32_e32 v19, v20, v26
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v22, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v24, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v20, v2, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -2625,9 +2636,9 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v24, v16
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB3_3
-; SDAG-NEXT: ; %bb.4: ; %Flow13
+; SDAG-NEXT: ; %bb.4: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB3_5: ; %Flow14
+; SDAG-NEXT: .LBB3_5: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v23
@@ -2637,7 +2648,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v30, v17, v23
; SDAG-NEXT: v_or_b32_e32 v31, v20, v18
; SDAG-NEXT: v_or_b32_e32 v32, v16, v22
-; SDAG-NEXT: .LBB3_6: ; %Flow16
+; SDAG-NEXT: .LBB3_6: ; %Flow15
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_or_b32_e32 v17, v13, v15
; SDAG-NEXT: v_or_b32_e32 v16, v12, v14
@@ -2708,24 +2719,24 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[22:23], v[4:5], v22
; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v20, vcc
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v21, vcc
-; SDAG-NEXT: v_or_b32_e32 v19, v34, v36
; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v18
-; SDAG-NEXT: v_or_b32_e32 v20, v35, v37
-; SDAG-NEXT: v_lshl_b64 v[24:25], v[6:7], v28
-; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v28
-; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v28
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20]
-; SDAG-NEXT: v_lshr_b64 v[18:19], v[4:5], v18
-; SDAG-NEXT: v_or_b32_e32 v19, v25, v19
-; SDAG-NEXT: v_or_b32_e32 v18, v24, v18
+; SDAG-NEXT: v_or_b32_e32 v18, v34, v36
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[6:7], v28
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v28
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v28
+; SDAG-NEXT: v_or_b32_e32 v19, v35, v37
+; SDAG-NEXT: v_lshr_b64 v[26:27], v[4:5], v26
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_or_b32_e32 v18, v21, v27
+; SDAG-NEXT: v_or_b32_e32 v19, v20, v26
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, v19, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v18, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v27, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v26, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v25, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v24, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v7, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v6, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v6, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v22, 0
; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -2802,7 +2813,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_cbranch_execnz .LBB3_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB3_11: ; %Flow11
+; SDAG-NEXT: .LBB3_11: ; %Flow9
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v21
@@ -2812,7 +2823,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v17, v17, v21
; SDAG-NEXT: v_or_b32_e32 v22, v22, v18
; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
-; SDAG-NEXT: .LBB3_12: ; %Flow12
+; SDAG-NEXT: .LBB3_12: ; %Flow10
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_mul_lo_u32 v20, v32, v11
; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v32, v10, 0
@@ -2934,32 +2945,34 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_addc_u32_e64 v31, s[4:5], 0, v19, vcc
; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v18
; GISEL-NEXT: v_not_b32_e32 v18, 63
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v32, vcc, 0, v16, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v33, vcc, 0, v17, vcc
-; GISEL-NEXT: v_add_i32_e64 v22, s[4:5], v26, v18
-; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v26
+; GISEL-NEXT: v_add_i32_e32 v22, vcc, v26, v18
+; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 64, v26
; GISEL-NEXT: v_lshl_b64 v[16:17], v[0:1], v26
; GISEL-NEXT: v_lshl_b64 v[18:19], v[2:3], v26
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v20
; GISEL-NEXT: v_lshl_b64 v[24:25], v[0:1], v22
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v16, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v17, vcc
-; GISEL-NEXT: v_or_b32_e32 v16, v20, v18
-; GISEL-NEXT: v_or_b32_e32 v17, v21, v19
-; GISEL-NEXT: v_cndmask_b32_e32 v16, v24, v16, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v17, v25, v17, vcc
+; GISEL-NEXT: v_or_b32_e32 v16, v30, v32
+; GISEL-NEXT: v_or_b32_e32 v17, v31, v33
+; GISEL-NEXT: v_or_b32_e32 v18, v20, v18
+; GISEL-NEXT: v_or_b32_e32 v19, v21, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v18, v24, v18, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, v25, v19, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
-; GISEL-NEXT: v_cndmask_b32_e32 v20, v16, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v21, v17, v3, vcc
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v20, v18, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v21, v19, v3, vcc
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
; GISEL-NEXT: v_mov_b32_e32 v19, s11
; GISEL-NEXT: v_mov_b32_e32 v18, s10
; GISEL-NEXT: v_mov_b32_e32 v17, s9
; GISEL-NEXT: v_mov_b32_e32 v16, s8
-; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB3_5
; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
; GISEL-NEXT: v_add_i32_e32 v26, vcc, 0xffffffc0, v30
@@ -3029,9 +3042,9 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v17, v25
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GISEL-NEXT: s_cbranch_execnz .LBB3_3
-; GISEL-NEXT: ; %bb.4: ; %Flow13
+; GISEL-NEXT: ; %bb.4: ; %Flow11
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB3_5: ; %Flow14
+; GISEL-NEXT: .LBB3_5: ; %Flow13
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1
; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
@@ -3039,7 +3052,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v20, v20, v22
; GISEL-NEXT: v_or_b32_e32 v32, v16, v18
; GISEL-NEXT: v_or_b32_e32 v33, v17, v19
-; GISEL-NEXT: .LBB3_6: ; %Flow16
+; GISEL-NEXT: .LBB3_6: ; %Flow15
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_or_b32_e32 v16, v12, v14
@@ -3107,32 +3120,34 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v23, vcc
; GISEL-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v22
; GISEL-NEXT: v_not_b32_e32 v18, 63
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v36, vcc, 0, v16, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v17, vcc
-; GISEL-NEXT: v_add_i32_e64 v24, s[4:5], v28, v18
-; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], 64, v28
+; GISEL-NEXT: v_add_i32_e32 v24, vcc, v28, v18
+; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28
; GISEL-NEXT: v_lshl_b64 v[16:17], v[4:5], v28
; GISEL-NEXT: v_lshl_b64 v[18:19], v[6:7], v28
-; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: v_lshr_b64 v[22:23], v[4:5], v22
; GISEL-NEXT: v_lshl_b64 v[26:27], v[4:5], v24
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28
; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v16, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v17, vcc
-; GISEL-NEXT: v_or_b32_e32 v16, v22, v18
-; GISEL-NEXT: v_or_b32_e32 v17, v23, v19
-; GISEL-NEXT: v_cndmask_b32_e32 v16, v26, v16, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v17, v27, v17, vcc
+; GISEL-NEXT: v_or_b32_e32 v16, v34, v36
+; GISEL-NEXT: v_or_b32_e32 v17, v35, v37
+; GISEL-NEXT: v_or_b32_e32 v18, v22, v18
+; GISEL-NEXT: v_or_b32_e32 v19, v23, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v18, v26, v18, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28
-; GISEL-NEXT: v_cndmask_b32_e32 v22, v16, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v23, v17, v7, vcc
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v22, v18, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v23, v19, v7, vcc
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
; GISEL-NEXT: v_mov_b32_e32 v19, s11
; GISEL-NEXT: v_mov_b32_e32 v18, s10
; GISEL-NEXT: v_mov_b32_e32 v17, s9
; GISEL-NEXT: v_mov_b32_e32 v16, s8
-; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB3_11
; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v34
@@ -3204,7 +3219,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_cbranch_execnz .LBB3_9
; GISEL-NEXT: ; %bb.10: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB3_11: ; %Flow11
+; GISEL-NEXT: .LBB3_11: ; %Flow9
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[26:27], v[24:25], 1
; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1
@@ -3212,7 +3227,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v18, v18, v22
; GISEL-NEXT: v_or_b32_e32 v24, v16, v26
; GISEL-NEXT: v_or_b32_e32 v25, v17, v27
-; GISEL-NEXT: .LBB3_12: ; %Flow12
+; GISEL-NEXT: .LBB3_12: ; %Flow10
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0
; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0
>From d501fbeed46fd97b6f015918eb17346f066791f8 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Thu, 25 Sep 2025 14:33:26 -0400
Subject: [PATCH 16/17] f
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 23 -
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 -
llvm/lib/Target/ARM/ARMISelLowering.h | 3 -
.../AArch64/atomicrmw-uinc-udec-wrap.ll | 16 +-
llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 663 +++++++++---------
6 files changed, 335 insertions(+), 374 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index acbc84568300d0..7a21b287620358 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3466,7 +3466,7 @@ class LLVM_ABI TargetLoweringBase {
// concrete users.
if (VT.isVector())
return false;
- return MathUsed && !isOperationExpand(Opcode, VT);
+ return MathUsed && (isTypeLegal(VT) || !isOperationExpand(Opcode, VT));
}
// Return true if it is profitable to use a scalar input to a BUILD_VECTOR
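For context, shouldFormOverflowOp is the predicate CodeGenPrepare consults before merging a separate
overflow check and math op into a single overflow intrinsic. The relaxed condition above additionally
lets a type through when the type itself is legal, even if the overflow node for it is marked Expand.
A minimal IR sketch of the kind of pattern this gates (the function name is illustrative only, not
taken from the patch):

  define i1 @usubo_sketch(i32 %a, i32 %b, ptr %p) {
    %ov  = icmp ult i32 %a, %b   ; unsigned-subtract overflow check
    %sub = sub i32 %a, %b        ; math result is used as well (MathUsed)
    store i32 %sub, ptr %p
    ret i1 %ov
  }

When the predicate returns true, this pair can be rewritten to use
@llvm.usub.with.overflow.i32 and a pair of extractvalues.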
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a4c1e265f0e63b..70dc311192206e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25896,29 +25896,6 @@ static SDValue performCSELCombine(SDNode *N,
}
}
- // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
- // use overflow flags, to avoid the comparison with zero. In case of success,
- // this also replaces the original SUB(x,y) with the newly created SUBS(x,y).
- // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB
- // nodes with their SUBS equivalent as is already done for other flag-setting
- // operators, in which case doing the replacement here becomes redundant.
- if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) &&
- isNullConstant(Cond.getOperand(1))) {
- SDValue Sub = Cond.getOperand(0);
- AArch64CC::CondCode CC =
- static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
- if (Sub.getOpcode() == ISD::SUB &&
- (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
- CC == AArch64CC::PL)) {
- SDLoc DL(N);
- SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
- Sub.getOperand(0), Sub.getOperand(1));
- DCI.CombineTo(Sub.getNode(), Subs);
- DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1));
- return SDValue(N, 0);
- }
- }
-
// CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
if (SDValue CondLast = foldCSELofLASTB(N, DAG))
return CondLast;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 510773bfccf887..d8072d15853ee0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -321,8 +321,6 @@ class AArch64TargetLowering : public TargetLowering {
bool MathUsed) const override {
// Using overflow ops for overflow checks only should beneficial on
// AArch64.
- if (VT == MVT::i8 || VT == MVT::i16)
- return false;
return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index cfce133bbaad01..5417d0ac839a9f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -641,9 +641,6 @@ class VectorType;
bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
bool MathUsed) const override {
- // Using overflow ops for overflow checks only should beneficial on ARM.
- if (VT == MVT::i8 || VT == MVT::i16)
- return false;
return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
}
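With the i8/i16 early-outs above removed, both the AArch64 and ARM overrides defer entirely to the
common TargetLowering heuristic, so narrow types are no longer rejected up front; whether an
intrinsic is actually formed still depends on the base predicate. A small illustrative pattern of
the kind affected (names are made up, not from the patch):

  define i1 @uaddo_i8_sketch(i8 %a, i8 %b, ptr %p) {
    %add = add i8 %a, %b
    %ov  = icmp ult i8 %add, %a   ; unsigned-add overflow check
    store i8 %add, ptr %p
    ret i1 %ov
  }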
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll
index 66fea3535b1ec3..86d8c13811d71c 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll
@@ -113,10 +113,12 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
; CHECK-NEXT: .LBB6_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldaxr w8, [x0]
+; CHECK-NEXT: subs w9, w8, #1
+; CHECK-NEXT: cset w10, lo
; CHECK-NEXT: cmp w8, w1
-; CHECK-NEXT: sub w9, w8, #1
-; CHECK-NEXT: ccmp w8, #0, #4, ls
-; CHECK-NEXT: csel w9, w1, w9, eq
+; CHECK-NEXT: csinc w10, w10, wzr, ls
+; CHECK-NEXT: cmp w10, #0
+; CHECK-NEXT: csel w9, w1, w9, ne
; CHECK-NEXT: stlxr w10, w9, [x0]
; CHECK-NEXT: cbnz w10, .LBB6_1
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
@@ -133,10 +135,12 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
; CHECK-NEXT: .LBB7_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldaxr x0, [x8]
+; CHECK-NEXT: subs x9, x0, #1
+; CHECK-NEXT: cset w10, lo
; CHECK-NEXT: cmp x0, x1
-; CHECK-NEXT: sub x9, x0, #1
-; CHECK-NEXT: ccmp x0, #0, #4, ls
-; CHECK-NEXT: csel x9, x1, x9, eq
+; CHECK-NEXT: csinc w10, w10, wzr, ls
+; CHECK-NEXT: cmp w10, #0
+; CHECK-NEXT: csel x9, x1, x9, ne
; CHECK-NEXT: stlxr w10, x9, [x8]
; CHECK-NEXT: cbnz w10, .LBB7_1
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
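The updated CHECK lines above show the decrement being done with a subs/cset (unsigned-subtract
overflow) sequence instead of the previous ccmp-based select. For reference, these tests exercise
atomicrmw udec_wrap, which per the LangRef stores ((old == 0 || old u> val) ? val : old - 1); a
minimal sketch of that input, with an arbitrarily chosen ordering (the ordering used in the actual
test file is not shown in this hunk):

  define i32 @udec_wrap_sketch(ptr %p, i32 %val) {
    %old = atomicrmw udec_wrap ptr %p, i32 %val seq_cst
    ret i32 %old
  }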
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 34fe1255f9c3b2..5134159e3e406a 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -97,24 +97,24 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[18:19], v[20:21], v18
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
-; SDAG-NEXT: v_sub_i32_e32 v36, vcc, 0x7f, v8
-; SDAG-NEXT: v_or_b32_e32 v8, v30, v32
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[16:17], v36
-; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 64, v36
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[20:21], v36
-; SDAG-NEXT: v_or_b32_e32 v9, v31, v33
-; SDAG-NEXT: v_lshr_b64 v[34:35], v[20:21], v34
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT: v_or_b32_e32 v8, v11, v35
-; SDAG-NEXT: v_or_b32_e32 v9, v10, v34
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v36
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v19, v8, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v18, v9, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v23, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v22, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v36
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v8, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v10, v16, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v9, v30, v32
+; SDAG-NEXT: v_sub_i32_e32 v11, vcc, 0x7f, v8
+; SDAG-NEXT: v_or_b32_e32 v10, v31, v33
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v11
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 64, v11
+; SDAG-NEXT: v_lshl_b64 v[34:35], v[20:21], v11
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10]
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v8
+; SDAG-NEXT: v_or_b32_e32 v9, v23, v9
+; SDAG-NEXT: v_or_b32_e32 v8, v22, v8
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v11
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v19, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v35, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v34, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v10, 0
; SDAG-NEXT: v_mov_b32_e32 v11, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -189,9 +189,9 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v16, v2
; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SDAG-NEXT: s_cbranch_execnz .LBB0_3
-; SDAG-NEXT: ; %bb.4: ; %Flow11
+; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; SDAG-NEXT: .LBB0_5: ; %Flow13
+; SDAG-NEXT: .LBB0_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v19
@@ -201,7 +201,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v19, v3, v9
; SDAG-NEXT: v_or_b32_e32 v22, v10, v0
; SDAG-NEXT: v_or_b32_e32 v23, v2, v8
-; SDAG-NEXT: .LBB0_6: ; %Flow15
+; SDAG-NEXT: .LBB0_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v7
; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15
@@ -294,24 +294,24 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
-; SDAG-NEXT: v_sub_i32_e32 v36, vcc, 0x7f, v6
-; SDAG-NEXT: v_or_b32_e32 v6, v30, v32
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[8:9], v36
-; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 64, v36
-; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v36
-; SDAG-NEXT: v_or_b32_e32 v7, v31, v33
-; SDAG-NEXT: v_lshr_b64 v[34:35], v[2:3], v34
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT: v_or_b32_e32 v6, v11, v35
-; SDAG-NEXT: v_or_b32_e32 v7, v10, v34
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v36
-; SDAG-NEXT: v_cndmask_b32_e64 v6, v13, v6, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v7, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v10, v30, v32
+; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v6
+; SDAG-NEXT: v_or_b32_e32 v11, v31, v33
+; SDAG-NEXT: v_lshl_b64 v[6:7], v[8:9], v34
+; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35
+; SDAG-NEXT: v_or_b32_e32 v7, v7, v11
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v10
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v36
-; SDAG-NEXT: v_cndmask_b32_e64 v7, v6, v9, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v8, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: v_mov_b32_e32 v13, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -388,7 +388,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_cbranch_execnz .LBB0_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB0_11: ; %Flow9
+; SDAG-NEXT: .LBB0_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v11
@@ -398,7 +398,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v14, v5, v3
; SDAG-NEXT: v_or_b32_e32 v5, v12, v0
; SDAG-NEXT: v_or_b32_e32 v4, v4, v2
-; SDAG-NEXT: .LBB0_12: ; %Flow10
+; SDAG-NEXT: .LBB0_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26
; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24
@@ -509,34 +509,32 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_addc_u32_e64 v29, s[4:5], 0, v3, vcc
; GISEL-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v2
; GISEL-NEXT: v_not_b32_e32 v2, 63
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v0, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v32, v2
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 64, v32
+; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v32, v2
+; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 64, v32
; GISEL-NEXT: v_lshl_b64 v[0:1], v[18:19], v32
; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], v32
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: v_lshr_b64 v[8:9], v[18:19], v8
; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v16
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v32
; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc
-; GISEL-NEXT: v_or_b32_e32 v0, v28, v30
-; GISEL-NEXT: v_or_b32_e32 v1, v29, v31
-; GISEL-NEXT: v_or_b32_e32 v2, v8, v2
-; GISEL-NEXT: v_or_b32_e32 v3, v9, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v23, v3, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v8, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v9, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v2, v20, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v9, v3, v21, vcc
-; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v20, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v21, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_mov_b32_e32 v0, s8
; GISEL-NEXT: v_mov_b32_e32 v1, s9
; GISEL-NEXT: v_mov_b32_e32 v2, s10
; GISEL-NEXT: v_mov_b32_e32 v3, s11
-; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[4:5]
+; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9]
; GISEL-NEXT: s_cbranch_execz .LBB0_5
; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v28
@@ -548,6 +546,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], v22
; GISEL-NEXT: v_or_b32_e32 v22, v2, v22
; GISEL-NEXT: v_or_b32_e32 v23, v3, v23
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_lshr_b64 v[2:3], v[20:21], v32
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v22, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v23, vcc
@@ -605,9 +604,9 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v1, v23
; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GISEL-NEXT: s_cbranch_execnz .LBB0_3
-; GISEL-NEXT: ; %bb.4: ; %Flow11
+; GISEL-NEXT: ; %bb.4: ; %Flow13
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
-; GISEL-NEXT: .LBB0_5: ; %Flow13
+; GISEL-NEXT: .LBB0_5: ; %Flow14
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_lshl_b64 v[2:3], v[16:17], 1
; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
@@ -615,48 +614,48 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v8, v8, v10
; GISEL-NEXT: v_or_b32_e32 v22, v0, v2
; GISEL-NEXT: v_or_b32_e32 v23, v1, v3
-; GISEL-NEXT: .LBB0_6: ; %Flow15
+; GISEL-NEXT: .LBB0_6: ; %Flow16
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7
; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x7f
-; GISEL-NEXT: v_mov_b32_e32 v17, 0
+; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v11, 0
; GISEL-NEXT: v_xor_b32_e32 v0, v18, v4
; GISEL-NEXT: v_xor_b32_e32 v1, v18, v5
; GISEL-NEXT: v_xor_b32_e32 v2, v18, v6
; GISEL-NEXT: v_xor_b32_e32 v3, v18, v7
; GISEL-NEXT: v_xor_b32_e32 v4, v19, v12
; GISEL-NEXT: v_xor_b32_e32 v5, v19, v13
-; GISEL-NEXT: v_xor_b32_e32 v12, v19, v14
-; GISEL-NEXT: v_xor_b32_e32 v13, v19, v15
+; GISEL-NEXT: v_xor_b32_e32 v14, v19, v14
+; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15
; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18
; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc
; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], v4, v19
; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], v5, v19, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v2, v18, vcc
-; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v3, v18, vcc
-; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v12, v19, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v13, v19, vcc
-; GISEL-NEXT: v_ffbh_u32_e32 v12, v21
-; GISEL-NEXT: v_ffbh_u32_e32 v13, v20
-; GISEL-NEXT: v_ffbh_u32_e32 v14, v7
-; GISEL-NEXT: v_ffbh_u32_e32 v15, v6
+; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc
+; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc
+; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v14, v21
+; GISEL-NEXT: v_ffbh_u32_e32 v15, v20
+; GISEL-NEXT: v_ffbh_u32_e32 v16, v7
+; GISEL-NEXT: v_ffbh_u32_e32 v17, v6
; GISEL-NEXT: v_or_b32_e32 v0, v20, v4
; GISEL-NEXT: v_or_b32_e32 v1, v21, v5
-; GISEL-NEXT: v_or_b32_e32 v2, v6, v10
-; GISEL-NEXT: v_or_b32_e32 v3, v7, v11
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 32, v13
+; GISEL-NEXT: v_or_b32_e32 v2, v6, v12
+; GISEL-NEXT: v_or_b32_e32 v3, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15
; GISEL-NEXT: v_ffbh_u32_e32 v26, v5
; GISEL-NEXT: v_ffbh_u32_e32 v27, v4
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15
-; GISEL-NEXT: v_ffbh_u32_e32 v28, v11
-; GISEL-NEXT: v_ffbh_u32_e32 v29, v10
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, 32, v17
+; GISEL-NEXT: v_ffbh_u32_e32 v28, v13
+; GISEL-NEXT: v_ffbh_u32_e32 v29, v12
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
-; GISEL-NEXT: v_min_u32_e32 v0, v12, v13
+; GISEL-NEXT: v_min_u32_e32 v0, v14, v15
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v27
-; GISEL-NEXT: v_min_u32_e32 v2, v14, v15
+; GISEL-NEXT: v_min_u32_e32 v2, v16, v17
; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v29
; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0
; GISEL-NEXT: v_min_u32_e32 v1, v26, v1
@@ -666,32 +665,32 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13]
; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[16:17]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11]
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, 0x7f, v2
+; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v2
; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_or_b32_e32 v12, v12, v0
-; GISEL-NEXT: v_or_b32_e32 v13, v3, v1
+; GISEL-NEXT: v_or_b32_e32 v10, v10, v0
+; GISEL-NEXT: v_or_b32_e32 v11, v3, v1
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_or_b32_e32 v13, v14, v15
-; GISEL-NEXT: v_and_b32_e32 v14, 1, v13
-; GISEL-NEXT: v_or_b32_e32 v12, v13, v12
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v11, v14, v15
+; GISEL-NEXT: v_and_b32_e32 v14, 1, v11
+; GISEL-NEXT: v_or_b32_e32 v10, v11, v10
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc
-; GISEL-NEXT: v_and_b32_e32 v16, 1, v12
+; GISEL-NEXT: v_and_b32_e32 v16, 1, v10
; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -701,58 +700,56 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v3, vcc
; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v2
; GISEL-NEXT: v_not_b32_e32 v2, 63
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v0, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v30, v2
-; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 64, v30
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v30, v2
+; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v30
; GISEL-NEXT: v_lshl_b64 v[0:1], v[6:7], v30
-; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], v30
-; GISEL-NEXT: v_lshr_b64 v[12:13], v[6:7], v12
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v30
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[10:11], v[6:7], v10
; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v14
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30
; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc
-; GISEL-NEXT: v_or_b32_e32 v0, v26, v28
-; GISEL-NEXT: v_or_b32_e32 v1, v27, v29
-; GISEL-NEXT: v_or_b32_e32 v2, v12, v2
-; GISEL-NEXT: v_or_b32_e32 v3, v13, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v10, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v11, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
-; GISEL-NEXT: v_cndmask_b32_e32 v12, v2, v10, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v13, v3, v11, vcc
-; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_mov_b32_e32 v0, s8
; GISEL-NEXT: v_mov_b32_e32 v1, s9
; GISEL-NEXT: v_mov_b32_e32 v2, s10
; GISEL-NEXT: v_mov_b32_e32 v3, s11
-; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
; GISEL-NEXT: s_cbranch_execz .LBB0_11
; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26
; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v26
-; GISEL-NEXT: v_lshr_b64 v[0:1], v[10:11], v26
+; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26
; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26
; GISEL-NEXT: s_mov_b64 s[4:5], 0
; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v20
; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v21, vcc
-; GISEL-NEXT: v_lshl_b64 v[16:17], v[10:11], v16
-; GISEL-NEXT: v_lshr_b64 v[10:11], v[10:11], v32
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16
+; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32
; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc
; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v5, vcc
; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
; GISEL-NEXT: v_or_b32_e32 v2, v2, v16
; GISEL-NEXT: v_or_b32_e32 v3, v3, v17
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v3, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v2, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v3, v7, vcc
; GISEL-NEXT: v_mov_b32_e32 v7, 0
; GISEL-NEXT: v_mov_b32_e32 v0, s4
; GISEL-NEXT: v_mov_b32_e32 v1, s5
@@ -760,20 +757,20 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v3, s7
; GISEL-NEXT: .LBB0_9: ; %udiv-do-while
; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], 1
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], 1
; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
-; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v11
-; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v13
-; GISEL-NEXT: v_lshl_b64 v[10:11], v[14:15], 1
-; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v13
+; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v11
+; GISEL-NEXT: v_lshl_b64 v[12:13], v[14:15], 1
+; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v15
; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26
; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc
; GISEL-NEXT: v_or_b32_e32 v16, v16, v6
; GISEL-NEXT: v_or_b32_e32 v2, v2, v34
-; GISEL-NEXT: v_or_b32_e32 v12, v12, v14
-; GISEL-NEXT: v_or_b32_e32 v14, v0, v10
-; GISEL-NEXT: v_or_b32_e32 v15, v1, v11
+; GISEL-NEXT: v_or_b32_e32 v10, v10, v14
+; GISEL-NEXT: v_or_b32_e32 v14, v0, v12
+; GISEL-NEXT: v_or_b32_e32 v15, v1, v13
; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc
; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v2
@@ -786,29 +783,29 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6
; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_and_b32_e32 v6, 1, v0
-; GISEL-NEXT: v_and_b32_e32 v10, v0, v20
-; GISEL-NEXT: v_and_b32_e32 v11, v0, v21
+; GISEL-NEXT: v_and_b32_e32 v12, v0, v20
+; GISEL-NEXT: v_and_b32_e32 v13, v0, v21
; GISEL-NEXT: v_and_b32_e32 v34, v0, v4
; GISEL-NEXT: v_and_b32_e32 v35, v0, v5
; GISEL-NEXT: v_mov_b32_e32 v0, v6
; GISEL-NEXT: v_mov_b32_e32 v1, v7
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v2, v10
-; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v3, v11, vcc
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v12
+; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v13, vcc
; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v34, vcc
; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GISEL-NEXT: s_cbranch_execnz .LBB0_9
; GISEL-NEXT: ; %bb.10: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB0_11: ; %Flow9
+; GISEL-NEXT: .LBB0_11: ; %Flow11
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1
-; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
+; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v15
-; GISEL-NEXT: v_or_b32_e32 v12, v12, v4
+; GISEL-NEXT: v_or_b32_e32 v10, v10, v4
; GISEL-NEXT: v_or_b32_e32 v14, v0, v2
; GISEL-NEXT: v_or_b32_e32 v15, v1, v3
-; GISEL-NEXT: .LBB0_12: ; %Flow10
+; GISEL-NEXT: .LBB0_12: ; %Flow12
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_xor_b32_e32 v3, v25, v24
; GISEL-NEXT: v_xor_b32_e32 v7, v19, v18
@@ -818,8 +815,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3
; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7
; GISEL-NEXT: v_xor_b32_e32 v5, v15, v7
-; GISEL-NEXT: v_xor_b32_e32 v8, v12, v7
-; GISEL-NEXT: v_xor_b32_e32 v9, v13, v7
+; GISEL-NEXT: v_xor_b32_e32 v8, v10, v7
+; GISEL-NEXT: v_xor_b32_e32 v9, v11, v7
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v7
@@ -906,24 +903,24 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16
; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v23, vcc
; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v24, vcc
-; SDAG-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v21
-; SDAG-NEXT: v_or_b32_e32 v21, v18, v28
-; SDAG-NEXT: v_lshl_b64 v[23:24], v[2:3], v32
-; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 64, v32
-; SDAG-NEXT: v_lshl_b64 v[25:26], v[0:1], v32
-; SDAG-NEXT: v_or_b32_e32 v22, v27, v29
-; SDAG-NEXT: v_lshr_b64 v[30:31], v[0:1], v30
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[21:22]
-; SDAG-NEXT: v_or_b32_e32 v21, v24, v31
-; SDAG-NEXT: v_or_b32_e32 v22, v23, v30
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v17, v21, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v23, v16, v22, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v26, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v25, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v32
-; SDAG-NEXT: v_cndmask_b32_e64 v22, v21, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v23, v2, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v22, v18, v28
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v21
+; SDAG-NEXT: v_or_b32_e32 v23, v27, v29
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v26
+; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 64, v26
+; SDAG-NEXT: v_lshl_b64 v[30:31], v[0:1], v26
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23]
+; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v21
+; SDAG-NEXT: v_or_b32_e32 v22, v25, v22
+; SDAG-NEXT: v_or_b32_e32 v21, v24, v21
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v17, v22, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v16, v21, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v30, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v21, v2, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: v_mov_b32_e32 v24, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -998,9 +995,9 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v25, v19
; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SDAG-NEXT: s_cbranch_execnz .LBB1_3
-; SDAG-NEXT: ; %bb.4: ; %Flow11
+; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; SDAG-NEXT: .LBB1_5: ; %Flow13
+; SDAG-NEXT: .LBB1_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[0:1], v[21:22], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v17
@@ -1010,7 +1007,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v18, v20, v3
; SDAG-NEXT: v_or_b32_e32 v17, v23, v0
; SDAG-NEXT: v_or_b32_e32 v19, v19, v2
-; SDAG-NEXT: .LBB1_6: ; %Flow15
+; SDAG-NEXT: .LBB1_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_or_b32_e32 v1, v13, v15
; SDAG-NEXT: v_or_b32_e32 v0, v12, v14
@@ -1081,24 +1078,24 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[8:9], v[4:5], v8
; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v20, vcc
; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v21, vcc
-; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v2
-; SDAG-NEXT: v_or_b32_e32 v2, v22, v24
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[6:7], v28
-; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v28
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v28
-; SDAG-NEXT: v_or_b32_e32 v3, v23, v25
-; SDAG-NEXT: v_lshr_b64 v[26:27], v[4:5], v26
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: v_or_b32_e32 v2, v11, v27
-; SDAG-NEXT: v_or_b32_e32 v3, v10, v26
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v8, v3, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v10, v22, v24
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v2
+; SDAG-NEXT: v_or_b32_e32 v11, v23, v25
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], v26
+; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v26
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v27
+; SDAG-NEXT: v_or_b32_e32 v3, v3, v11
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v10
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v21, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v20, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v2, v7, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v6, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -1175,7 +1172,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_cbranch_execnz .LBB1_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB1_11: ; %Flow9
+; SDAG-NEXT: .LBB1_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v9
@@ -1185,7 +1182,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v10, v1, v5
; SDAG-NEXT: v_or_b32_e32 v9, v20, v2
; SDAG-NEXT: v_or_b32_e32 v11, v0, v4
-; SDAG-NEXT: .LBB1_12: ; %Flow10
+; SDAG-NEXT: .LBB1_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v0, v19
; SDAG-NEXT: v_mov_b32_e32 v1, v18
@@ -1266,36 +1263,34 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v22
; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v23, vcc
-; GISEL-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v22
+; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v22
; GISEL-NEXT: v_not_b32_e32 v2, 63
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v20, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v21, vcc
-; GISEL-NEXT: v_add_i32_e32 v22, vcc, v32, v2
-; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 64, v32
-; GISEL-NEXT: v_lshl_b64 v[2:3], v[0:1], v32
-; GISEL-NEXT: v_lshl_b64 v[18:19], v[16:17], v32
+; GISEL-NEXT: v_add_i32_e64 v22, s[4:5], v30, v2
+; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v30
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[0:1], v30
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[16:17], v30
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v20
; GISEL-NEXT: v_lshl_b64 v[24:25], v[0:1], v22
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v32
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30
; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v2, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v3, vcc
-; GISEL-NEXT: v_or_b32_e32 v30, v26, v28
-; GISEL-NEXT: v_or_b32_e32 v31, v27, v29
; GISEL-NEXT: v_or_b32_e32 v2, v20, v18
; GISEL-NEXT: v_or_b32_e32 v3, v21, v19
; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[30:31]
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_mov_b32_e32 v21, s11
; GISEL-NEXT: v_mov_b32_e32 v20, s10
; GISEL-NEXT: v_mov_b32_e32 v19, s9
; GISEL-NEXT: v_mov_b32_e32 v18, s8
-; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
; GISEL-NEXT: s_cbranch_execz .LBB1_5
; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26
@@ -1365,9 +1360,9 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v18, v0
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GISEL-NEXT: s_cbranch_execnz .LBB1_3
-; GISEL-NEXT: ; %bb.4: ; %Flow11
+; GISEL-NEXT: ; %bb.4: ; %Flow13
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB1_5: ; %Flow13
+; GISEL-NEXT: .LBB1_5: ; %Flow14
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[0:1], v[22:23], 1
; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
@@ -1375,7 +1370,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v2, v2, v8
; GISEL-NEXT: v_or_b32_e32 v18, v18, v0
; GISEL-NEXT: v_or_b32_e32 v19, v19, v1
-; GISEL-NEXT: .LBB1_6: ; %Flow15
+; GISEL-NEXT: .LBB1_6: ; %Flow16
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_or_b32_e32 v0, v12, v14
@@ -1441,36 +1436,34 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: ; %bb.7: ; %udiv-bb1
; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v16
; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v17, vcc
-; GISEL-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v16
+; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16
; GISEL-NEXT: v_not_b32_e32 v9, 63
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v0, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v28, v9
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 64, v28
-; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v28
-; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v28
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v26, v9
+; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v26
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v26
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v26
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v10
; GISEL-NEXT: v_lshl_b64 v[22:23], v[4:5], v9
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc
-; GISEL-NEXT: v_or_b32_e32 v26, v8, v24
-; GISEL-NEXT: v_or_b32_e32 v27, v11, v25
; GISEL-NEXT: v_or_b32_e32 v0, v20, v16
; GISEL-NEXT: v_or_b32_e32 v1, v21, v17
; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[26:27]
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_mov_b32_e32 v23, s11
; GISEL-NEXT: v_mov_b32_e32 v22, s10
; GISEL-NEXT: v_mov_b32_e32 v21, s9
; GISEL-NEXT: v_mov_b32_e32 v20, s8
-; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
; GISEL-NEXT: s_cbranch_execz .LBB1_11
; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v8
@@ -1542,7 +1535,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_cbranch_execnz .LBB1_9
; GISEL-NEXT: ; %bb.10: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB1_11: ; %Flow9
+; GISEL-NEXT: .LBB1_11: ; %Flow11
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], 1
; GISEL-NEXT: v_lshl_b64 v[8:9], v[0:1], 1
@@ -1550,7 +1543,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v8, v8, v0
; GISEL-NEXT: v_or_b32_e32 v10, v20, v4
; GISEL-NEXT: v_or_b32_e32 v11, v21, v5
-; GISEL-NEXT: .LBB1_12: ; %Flow10
+; GISEL-NEXT: .LBB1_12: ; %Flow12
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mov_b32_e32 v0, v18
; GISEL-NEXT: v_mov_b32_e32 v1, v19
@@ -1657,24 +1650,24 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20
; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc
-; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v10
-; SDAG-NEXT: v_or_b32_e32 v10, v32, v34
-; SDAG-NEXT: v_lshl_b64 v[18:19], v[0:1], v26
-; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 64, v26
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v26
-; SDAG-NEXT: v_or_b32_e32 v11, v33, v35
-; SDAG-NEXT: v_lshr_b64 v[24:25], v[16:17], v24
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_or_b32_e32 v10, v19, v25
-; SDAG-NEXT: v_or_b32_e32 v11, v18, v24
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v21, v10, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v20, v11, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v18, v32, v34
+; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10
+; SDAG-NEXT: v_or_b32_e32 v19, v33, v35
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24
+; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25
+; SDAG-NEXT: v_or_b32_e32 v11, v11, v19
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v18
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v10, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v18, v0, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v18, 0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -1749,9 +1742,9 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v22, v8
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB2_3
-; SDAG-NEXT: ; %bb.4: ; %Flow11
+; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB2_5: ; %Flow13
+; SDAG-NEXT: .LBB2_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21
@@ -1761,7 +1754,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v32, v18, v10
; SDAG-NEXT: v_or_b32_e32 v27, v9, v21
; SDAG-NEXT: v_or_b32_e32 v33, v8, v20
-; SDAG-NEXT: .LBB2_6: ; %Flow15
+; SDAG-NEXT: .LBB2_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7
; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
@@ -1852,22 +1845,22 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[18:19], v[8:9], v18
; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc
; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v15, vcc
-; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10
-; SDAG-NEXT: v_or_b32_e32 v10, v38, v48
-; SDAG-NEXT: v_lshl_b64 v[14:15], v[4:5], v24
-; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 64, v24
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[8:9], v24
-; SDAG-NEXT: v_or_b32_e32 v11, v39, v49
-; SDAG-NEXT: v_lshr_b64 v[22:23], v[8:9], v22
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_or_b32_e32 v10, v15, v23
-; SDAG-NEXT: v_or_b32_e32 v11, v14, v22
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v10, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v11, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v14, v38, v48
+; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 0x7f, v10
+; SDAG-NEXT: v_or_b32_e32 v15, v39, v49
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[4:5], v22
+; SDAG-NEXT: v_sub_i32_e32 v23, vcc, 64, v22
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[8:9], v22
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT: v_lshr_b64 v[14:15], v[8:9], v23
+; SDAG-NEXT: v_or_b32_e32 v11, v11, v15
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v14
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v10, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v21, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v20, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
; SDAG-NEXT: v_cndmask_b32_e64 v15, v14, v5, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v14, v18, v4, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v18, 0
@@ -1946,7 +1939,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_cbranch_execnz .LBB2_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB2_11: ; %Flow9
+; SDAG-NEXT: .LBB2_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v11
@@ -1956,7 +1949,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v13, v13, v11
; SDAG-NEXT: v_or_b32_e32 v18, v18, v14
; SDAG-NEXT: v_or_b32_e32 v12, v12, v10
-; SDAG-NEXT: .LBB2_12: ; %Flow10
+; SDAG-NEXT: .LBB2_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_mul_lo_u32 v14, v33, v3
; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0
@@ -2114,34 +2107,32 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v3, vcc
; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v2
; GISEL-NEXT: v_not_b32_e32 v2, 63
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v33, vcc, 0, v0, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v34, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v20, vcc, v24, v2
-; GISEL-NEXT: v_sub_i32_e32 v18, vcc, 64, v24
+; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v24, v2
+; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 64, v24
; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v24
; GISEL-NEXT: v_lshl_b64 v[2:3], v[8:9], v24
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: v_lshr_b64 v[18:19], v[16:17], v18
; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc
-; GISEL-NEXT: v_or_b32_e32 v0, v31, v33
-; GISEL-NEXT: v_or_b32_e32 v1, v32, v34
-; GISEL-NEXT: v_or_b32_e32 v2, v18, v2
-; GISEL-NEXT: v_or_b32_e32 v3, v19, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v23, v3, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v18, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v19, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GISEL-NEXT: v_cndmask_b32_e32 v18, v2, v8, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v19, v3, v9, vcc
-; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v18, v0, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, v1, v9, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_mov_b32_e32 v0, s8
; GISEL-NEXT: v_mov_b32_e32 v1, s9
; GISEL-NEXT: v_mov_b32_e32 v2, s10
; GISEL-NEXT: v_mov_b32_e32 v3, s11
-; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
; GISEL-NEXT: s_cbranch_execz .LBB2_5
; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
; GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v31
@@ -2211,9 +2202,9 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v1, v23
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GISEL-NEXT: s_cbranch_execnz .LBB2_3
-; GISEL-NEXT: ; %bb.4: ; %Flow11
+; GISEL-NEXT: ; %bb.4: ; %Flow13
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB2_5: ; %Flow13
+; GISEL-NEXT: .LBB2_5: ; %Flow14
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
@@ -2221,7 +2212,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v18, v18, v20
; GISEL-NEXT: v_or_b32_e32 v31, v0, v2
; GISEL-NEXT: v_or_b32_e32 v32, v1, v3
-; GISEL-NEXT: .LBB2_6: ; %Flow15
+; GISEL-NEXT: .LBB2_6: ; %Flow16
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_ashrrev_i32_e32 v33, 31, v7
@@ -2307,34 +2298,32 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v15, vcc
; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v14
; GISEL-NEXT: v_not_b32_e32 v2, 63
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v38, vcc, 0, v0, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v39, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v20, vcc, v24, v2
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 64, v24
+; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v24, v2
+; GISEL-NEXT: v_sub_i32_e64 v14, s[4:5], 64, v24
; GISEL-NEXT: v_lshl_b64 v[0:1], v[12:13], v24
; GISEL-NEXT: v_lshl_b64 v[2:3], v[6:7], v24
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: v_lshr_b64 v[14:15], v[12:13], v14
; GISEL-NEXT: v_lshl_b64 v[22:23], v[12:13], v20
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc
-; GISEL-NEXT: v_or_b32_e32 v0, v36, v38
-; GISEL-NEXT: v_or_b32_e32 v1, v37, v39
-; GISEL-NEXT: v_or_b32_e32 v2, v14, v2
-; GISEL-NEXT: v_or_b32_e32 v3, v15, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v23, v3, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v14, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v15, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GISEL-NEXT: v_cndmask_b32_e32 v14, v2, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v15, v3, v7, vcc
-; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v15, v1, v7, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_mov_b32_e32 v0, s8
; GISEL-NEXT: v_mov_b32_e32 v1, s9
; GISEL-NEXT: v_mov_b32_e32 v2, s10
; GISEL-NEXT: v_mov_b32_e32 v3, s11
-; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
; GISEL-NEXT: s_cbranch_execz .LBB2_11
; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
; GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v36
@@ -2406,7 +2395,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_cbranch_execnz .LBB2_9
; GISEL-NEXT: ; %bb.10: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB2_11: ; %Flow9
+; GISEL-NEXT: .LBB2_11: ; %Flow11
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], 1
; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1
@@ -2414,7 +2403,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v2, v2, v14
; GISEL-NEXT: v_or_b32_e32 v20, v0, v22
; GISEL-NEXT: v_or_b32_e32 v21, v1, v23
-; GISEL-NEXT: .LBB2_12: ; %Flow10
+; GISEL-NEXT: .LBB2_12: ; %Flow12
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0
@@ -2544,24 +2533,24 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[22:23], v[0:1], v22
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v20, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v21, vcc
-; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v18
-; SDAG-NEXT: v_or_b32_e32 v18, v30, v32
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v28
-; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v28
-; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v28
-; SDAG-NEXT: v_or_b32_e32 v19, v31, v33
-; SDAG-NEXT: v_lshr_b64 v[26:27], v[0:1], v26
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_or_b32_e32 v18, v21, v27
-; SDAG-NEXT: v_or_b32_e32 v19, v20, v26
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v18, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v22, v19, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v24, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v20, v2, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v19, v30, v32
+; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0x7f, v18
+; SDAG-NEXT: v_or_b32_e32 v20, v31, v33
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v21
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v21
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v21
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20]
+; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v18
+; SDAG-NEXT: v_or_b32_e32 v19, v25, v19
+; SDAG-NEXT: v_or_b32_e32 v18, v24, v18
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v21
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v27, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v26, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v21
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -2636,9 +2625,9 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v24, v16
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB3_3
-; SDAG-NEXT: ; %bb.4: ; %Flow11
+; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB3_5: ; %Flow13
+; SDAG-NEXT: .LBB3_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v23
@@ -2648,7 +2637,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v30, v17, v23
; SDAG-NEXT: v_or_b32_e32 v31, v20, v18
; SDAG-NEXT: v_or_b32_e32 v32, v16, v22
-; SDAG-NEXT: .LBB3_6: ; %Flow15
+; SDAG-NEXT: .LBB3_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_or_b32_e32 v17, v13, v15
; SDAG-NEXT: v_or_b32_e32 v16, v12, v14
@@ -2719,24 +2708,24 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[22:23], v[4:5], v22
; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v20, vcc
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v21, vcc
+; SDAG-NEXT: v_or_b32_e32 v19, v34, v36
; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v18
-; SDAG-NEXT: v_or_b32_e32 v18, v34, v36
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[6:7], v28
-; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v28
-; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v28
-; SDAG-NEXT: v_or_b32_e32 v19, v35, v37
-; SDAG-NEXT: v_lshr_b64 v[26:27], v[4:5], v26
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_or_b32_e32 v18, v21, v27
-; SDAG-NEXT: v_or_b32_e32 v19, v20, v26
+; SDAG-NEXT: v_or_b32_e32 v20, v35, v37
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[6:7], v28
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v28
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v28
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20]
+; SDAG-NEXT: v_lshr_b64 v[18:19], v[4:5], v18
+; SDAG-NEXT: v_or_b32_e32 v19, v25, v19
+; SDAG-NEXT: v_or_b32_e32 v18, v24, v18
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v18, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v19, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v25, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v24, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v27, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v26, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v7, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v6, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v6, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v22, 0
; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -2813,7 +2802,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_cbranch_execnz .LBB3_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB3_11: ; %Flow9
+; SDAG-NEXT: .LBB3_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v21
@@ -2823,7 +2812,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v17, v17, v21
; SDAG-NEXT: v_or_b32_e32 v22, v22, v18
; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
-; SDAG-NEXT: .LBB3_12: ; %Flow10
+; SDAG-NEXT: .LBB3_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_mul_lo_u32 v20, v32, v11
; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v32, v10, 0
@@ -2945,34 +2934,32 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_addc_u32_e64 v31, s[4:5], 0, v19, vcc
; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v18
; GISEL-NEXT: v_not_b32_e32 v18, 63
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v32, vcc, 0, v16, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v33, vcc, 0, v17, vcc
-; GISEL-NEXT: v_add_i32_e32 v22, vcc, v26, v18
-; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 64, v26
+; GISEL-NEXT: v_add_i32_e64 v22, s[4:5], v26, v18
+; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v26
; GISEL-NEXT: v_lshl_b64 v[16:17], v[0:1], v26
; GISEL-NEXT: v_lshl_b64 v[18:19], v[2:3], v26
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v20
; GISEL-NEXT: v_lshl_b64 v[24:25], v[0:1], v22
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v16, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v17, vcc
-; GISEL-NEXT: v_or_b32_e32 v16, v30, v32
-; GISEL-NEXT: v_or_b32_e32 v17, v31, v33
-; GISEL-NEXT: v_or_b32_e32 v18, v20, v18
-; GISEL-NEXT: v_or_b32_e32 v19, v21, v19
-; GISEL-NEXT: v_cndmask_b32_e32 v18, v24, v18, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v19, v25, v19, vcc
+; GISEL-NEXT: v_or_b32_e32 v16, v20, v18
+; GISEL-NEXT: v_or_b32_e32 v17, v21, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v16, v24, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v17, v25, v17, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
-; GISEL-NEXT: v_cndmask_b32_e32 v20, v18, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v21, v19, v3, vcc
-; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_cndmask_b32_e32 v20, v16, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v21, v17, v3, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_mov_b32_e32 v19, s11
; GISEL-NEXT: v_mov_b32_e32 v18, s10
; GISEL-NEXT: v_mov_b32_e32 v17, s9
; GISEL-NEXT: v_mov_b32_e32 v16, s8
-; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
; GISEL-NEXT: s_cbranch_execz .LBB3_5
; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
; GISEL-NEXT: v_add_i32_e32 v26, vcc, 0xffffffc0, v30
@@ -3042,9 +3029,9 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v17, v25
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GISEL-NEXT: s_cbranch_execnz .LBB3_3
-; GISEL-NEXT: ; %bb.4: ; %Flow11
+; GISEL-NEXT: ; %bb.4: ; %Flow13
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB3_5: ; %Flow13
+; GISEL-NEXT: .LBB3_5: ; %Flow14
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1
; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
@@ -3052,7 +3039,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v20, v20, v22
; GISEL-NEXT: v_or_b32_e32 v32, v16, v18
; GISEL-NEXT: v_or_b32_e32 v33, v17, v19
-; GISEL-NEXT: .LBB3_6: ; %Flow15
+; GISEL-NEXT: .LBB3_6: ; %Flow16
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_or_b32_e32 v16, v12, v14
@@ -3120,34 +3107,32 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v23, vcc
; GISEL-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v22
; GISEL-NEXT: v_not_b32_e32 v18, 63
-; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_addc_u32_e64 v36, vcc, 0, v16, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v17, vcc
-; GISEL-NEXT: v_add_i32_e32 v24, vcc, v28, v18
-; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28
+; GISEL-NEXT: v_add_i32_e64 v24, s[4:5], v28, v18
+; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], 64, v28
; GISEL-NEXT: v_lshl_b64 v[16:17], v[4:5], v28
; GISEL-NEXT: v_lshl_b64 v[18:19], v[6:7], v28
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: v_lshr_b64 v[22:23], v[4:5], v22
; GISEL-NEXT: v_lshl_b64 v[26:27], v[4:5], v24
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28
; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v16, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v17, vcc
-; GISEL-NEXT: v_or_b32_e32 v16, v34, v36
-; GISEL-NEXT: v_or_b32_e32 v17, v35, v37
-; GISEL-NEXT: v_or_b32_e32 v18, v22, v18
-; GISEL-NEXT: v_or_b32_e32 v19, v23, v19
-; GISEL-NEXT: v_cndmask_b32_e32 v18, v26, v18, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
+; GISEL-NEXT: v_or_b32_e32 v16, v22, v18
+; GISEL-NEXT: v_or_b32_e32 v17, v23, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v16, v26, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v17, v27, v17, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28
-; GISEL-NEXT: v_cndmask_b32_e32 v22, v18, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v23, v19, v7, vcc
-; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_cndmask_b32_e32 v22, v16, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v23, v17, v7, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_mov_b32_e32 v19, s11
; GISEL-NEXT: v_mov_b32_e32 v18, s10
; GISEL-NEXT: v_mov_b32_e32 v17, s9
; GISEL-NEXT: v_mov_b32_e32 v16, s8
-; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
; GISEL-NEXT: s_cbranch_execz .LBB3_11
; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v34
@@ -3219,7 +3204,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_cbranch_execnz .LBB3_9
; GISEL-NEXT: ; %bb.10: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB3_11: ; %Flow9
+; GISEL-NEXT: .LBB3_11: ; %Flow11
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[26:27], v[24:25], 1
; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1
@@ -3227,7 +3212,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v18, v18, v22
; GISEL-NEXT: v_or_b32_e32 v24, v16, v26
; GISEL-NEXT: v_or_b32_e32 v25, v17, v27
-; GISEL-NEXT: .LBB3_12: ; %Flow10
+; GISEL-NEXT: .LBB3_12: ; %Flow12
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0
; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0
>From d2e00e99a7e8c61ca659a956eab7798d50f3f24e Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Thu, 25 Sep 2025 15:03:06 -0400
Subject: [PATCH 17/17] d
---
.../Target/AArch64/AArch64ISelLowering.cpp | 23 +++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 70dc311192206e..a4c1e265f0e63b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25896,6 +25896,29 @@ static SDValue performCSELCombine(SDNode *N,
}
}
+ // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
+ // use overflow flags, to avoid the comparison with zero. In case of success,
+ // this also replaces the original SUB(x,y) with the newly created SUBS(x,y).
+ // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB
+ // nodes with their SUBS equivalent as is already done for other flag-setting
+ // operators, in which case doing the replacement here becomes redundant.
+ if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) &&
+ isNullConstant(Cond.getOperand(1))) {
+ SDValue Sub = Cond.getOperand(0);
+ AArch64CC::CondCode CC =
+ static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
+ if (Sub.getOpcode() == ISD::SUB &&
+ (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
+ CC == AArch64CC::PL)) {
+ SDLoc DL(N);
+ SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
+ Sub.getOperand(0), Sub.getOperand(1));
+ DCI.CombineTo(Sub.getNode(), Subs);
+ DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1));
+ return SDValue(N, 0);
+ }
+ }
+
// CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
if (SDValue CondLast = foldCSELofLASTB(N, DAG))
return CondLast;
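
For context on that final AArch64 hunk, here is a minimal sketch of the IR shape that lowers to the CSEL whose condition is SUBS(SUB(x,y), 0), i.e. the pattern the new performCSELCombine case targets. The function name and the before/after assembly shapes are illustrative assumptions based on the comment in the hunk, not output taken from this patch or its tests:

  define i32 @csel_of_sub_cmp_zero(i32 %x, i32 %y, i32 %a, i32 %b) {
    ; x - y, then compared against zero to drive the select
    %s = sub i32 %x, %y
    %c = icmp eq i32 %s, 0
    %r = select i1 %c, i32 %a, i32 %b
    ret i32 %r
  }

  ; Without the fold (shape only):  sub w8, w0, w1 ; cmp w8, #0 ; csel w0, w2, w3, eq
  ; With the fold (intended):       subs w8, w0, w1 ;             csel w0, w2, w3, eq

The restriction to EQ/NE/MI/PL matches the comment's requirement that the condition not consume the carry or overflow flags: SUBS(x, y) can set C and V differently from a compare of the subtraction result against zero, while N and Z depend only on the result and are therefore safe to reuse.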