[llvm] [DAGCombiner] Freeze maybe poison operands when folding select to logic (PR #84924)
Björn Pettersson via llvm-commits
llvm-commits at lists.llvm.org
Tue May 28 02:23:43 PDT 2024
https://github.com/bjope updated https://github.com/llvm/llvm-project/pull/84924
From a3e00332d70ec7dc72994328d37c5b18fd8e8ae7 Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson at ericsson.com>
Date: Tue, 28 May 2024 08:37:50 +0200
Subject: [PATCH 1/2] [DAGCombiner] Freeze maybe poison operands when folding
select to logic
Just like in regular IR, we need to treat SELECT as conditionally
blocking poison: unless the condition itself is poison, the result
is poison only if the selected true/false operand is poison.
Thus, when doing DAG combines that turn SELECT into arithmetic or
logical operations (e.g. AND/OR), we need to make sure that the new
operations aren't more poisonous. One way to do that is to use
FREEZE to make sure the operands aren't poison.
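As a minimal IR sketch of the hazard (the value names are
illustrative, not taken from the reported test cases):

  ; select only propagates poison from the arm it picks, so
  ;   %s = select i1 %c, i1 true, i1 %f
  ; is well-defined when %c is true, even if %f is poison.
  ; The naive fold
  ;   %s = or i1 %c, %f
  ; is poison whenever %f is poison, i.e. more poisonous than the
  ; original select. Freezing the maybe-poison operand keeps the
  ; fold sound:
  %f.fr = freeze i1 %f
  %s = or i1 %c, %f.fr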
This patch aims to fix the kind of miscompiles reported in
https://github.com/llvm/llvm-project/issues/84653
and
https://github.com/llvm/llvm-project/issues/85190
The solution is to insert FREEZE, when needed to make the fold
sound, in the foldBoolSelectToLogic and
foldVSelectToSignBitSplatMask DAG combines.
This may result in some (hopefully minor) regressions, since we
currently lack ways to fold away the freeze (or because
isGuaranteedNotToBePoison is too pessimistic). The focus of this
patch is simply to avoid the miscompiles; I think some of the
regressions can be recovered by general improvements to
poison/freeze handling in SelectionDAG.
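For reference, a freeze is a no-op once its operand is provably
neither poison nor undef, so such regressions should shrink as that
analysis improves (a hedged IR sketch, not code from this patch):

  %z = zext i1 %b to i32   ; zext does not introduce poison itself
  %z.fr = freeze i32 %z    ; foldable to %z whenever %b is known
                           ; not to be poison/undef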
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 +--
...rleaving-reductions-predicated-scalable.ll | 3 +-
llvm/test/CodeGen/AArch64/fast-isel-select.ll | 20 ++-
.../AArch64/intrinsic-cttz-elts-sve.ll | 3 +-
.../CodeGen/AArch64/sve-fp-int-min-max.ll | 3 +-
llvm/test/CodeGen/AMDGPU/div_i128.ll | 64 ++++-----
llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 134 +++++++++---------
.../AMDGPU/divergence-driven-trunc-to-i1.ll | 41 +++---
llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 60 ++++----
llvm/test/CodeGen/AMDGPU/rem_i128.ll | 64 ++++-----
llvm/test/CodeGen/PowerPC/pr40922.ll | 7 +-
llvm/test/CodeGen/RISCV/pr84653_pr85190.ll | 3 +-
llvm/test/CodeGen/SystemZ/pr60413.ll | 104 +++++++-------
llvm/test/CodeGen/VE/Scalar/max.ll | 2 +
llvm/test/CodeGen/VE/Scalar/min.ll | 2 +
.../X86/div-rem-pair-recomposition-signed.ll | 21 +--
llvm/test/CodeGen/X86/pr64589.ll | 4 +-
.../test/CodeGen/X86/vector-compare-all_of.ll | 41 ++++--
.../test/CodeGen/X86/vector-compare-any_of.ll | 27 ++--
19 files changed, 321 insertions(+), 300 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 93d866384b482..694fee6c09bdb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11473,28 +11473,28 @@ static SDValue foldBoolSelectToLogic(SDNode *N, SelectionDAG &DAG) {
if (VT != Cond.getValueType() || VT.getScalarSizeInBits() != 1)
return SDValue();
- // select Cond, Cond, F --> or Cond, F
- // select Cond, 1, F --> or Cond, F
+ // select Cond, Cond, F --> or Cond, freeze(F)
+ // select Cond, 1, F --> or Cond, freeze(F)
if (Cond == T || isOneOrOneSplat(T, /* AllowUndefs */ true))
- return matcher.getNode(ISD::OR, SDLoc(N), VT, Cond, F);
+ return matcher.getNode(ISD::OR, SDLoc(N), VT, Cond, DAG.getFreeze(F));
// select Cond, T, Cond --> and Cond, T
// select Cond, T, 0 --> and Cond, T
if (Cond == F || isNullOrNullSplat(F, /* AllowUndefs */ true))
- return matcher.getNode(ISD::AND, SDLoc(N), VT, Cond, T);
+ return matcher.getNode(ISD::AND, SDLoc(N), VT, Cond, DAG.getFreeze(T));
// select Cond, T, 1 --> or (not Cond), T
if (isOneOrOneSplat(F, /* AllowUndefs */ true)) {
SDValue NotCond = matcher.getNode(ISD::XOR, SDLoc(N), VT, Cond,
DAG.getAllOnesConstant(SDLoc(N), VT));
- return matcher.getNode(ISD::OR, SDLoc(N), VT, NotCond, T);
+ return matcher.getNode(ISD::OR, SDLoc(N), VT, NotCond, DAG.getFreeze(T));
}
// select Cond, 0, F --> and (not Cond), F
if (isNullOrNullSplat(T, /* AllowUndefs */ true)) {
SDValue NotCond = matcher.getNode(ISD::XOR, SDLoc(N), VT, Cond,
DAG.getAllOnesConstant(SDLoc(N), VT));
- return matcher.getNode(ISD::AND, SDLoc(N), VT, NotCond, F);
+ return matcher.getNode(ISD::AND, SDLoc(N), VT, NotCond, DAG.getFreeze(F));
}
return SDValue();
@@ -11528,7 +11528,7 @@ static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
- return DAG.getNode(ISD::AND, DL, VT, Sra, N1);
+ return DAG.getNode(ISD::AND, DL, VT, Sra, DAG.getFreeze(N1));
}
// (Cond0 s< 0) ? -1 : N2 --> (Cond0 s>> BW-1) | N2
@@ -11536,7 +11536,7 @@ static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
- return DAG.getNode(ISD::OR, DL, VT, Sra, N2);
+ return DAG.getNode(ISD::OR, DL, VT, Sra, DAG.getFreeze(N2));
}
// If we have to invert the sign bit mask, only do that transform if the
@@ -11548,7 +11548,7 @@ static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
SDValue Not = DAG.getNOT(DL, Sra, VT);
- return DAG.getNode(ISD::AND, DL, VT, Not, N2);
+ return DAG.getNode(ISD::AND, DL, VT, Not, DAG.getFreeze(N2));
}
// TODO: There's another pattern in this family, but it may require
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index ac2b21af29ab7..8f819f2d7c8d1 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -236,7 +236,8 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
; CHECK-NEXT: mov z7.d, z0.d
; CHECK-NEXT: add x9, x9, x11
; CHECK-NEXT: add x8, x8, x12
-; CHECK-NEXT: cmpne p2.d, p1/z, z2.d, #0
+; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0
+; CHECK-NEXT: and p2.b, p1/z, p1.b, p2.b
; CHECK-NEXT: zip2 p1.d, p2.d, p2.d
; CHECK-NEXT: zip1 p2.d, p2.d, p2.d
; CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl]
diff --git a/llvm/test/CodeGen/AArch64/fast-isel-select.ll b/llvm/test/CodeGen/AArch64/fast-isel-select.ll
index 6ad4a5ae572e0..4ef4ee41e8aeb 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-select.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-select.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-apple-darwin -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefix=GISEL
+; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,SISEL
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,FISEL
+; RUN: llc -mtriple=aarch64-apple-darwin -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GISEL
; First test the different supported value types for select.
define zeroext i1 @select_i1(i1 zeroext %c, i1 zeroext %a, i1 zeroext %b) {
@@ -295,22 +295,28 @@ define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) {
; Test peephole optimizations for select.
define zeroext i1 @select_opt1(i1 zeroext %c, i1 zeroext %a) {
; CHECK-LABEL: select_opt1
-; CHECK: orr {{w[0-9]+}}, w0, w1
+; SISEL: orr [[REG:w[0-9]+]], w0, w1
+; SISEL: and w0, [[REG]], #0x1
+; FISEL: orr {{w[0-9]+}}, w0, w1
%1 = select i1 %c, i1 true, i1 %a
ret i1 %1
}
define zeroext i1 @select_opt2(i1 zeroext %c, i1 zeroext %a) {
; CHECK-LABEL: select_opt2
-; CHECK: eor [[REG:w[0-9]+]], w0, #0x1
-; CHECK: orr {{w[0-9]+}}, [[REG]], w1
+; SISEL: orn [[REG:w[0-9]+]], w1, w0
+; SISEL: and w0, [[REG]], #0x1
+; FISEL: eor [[REG:w[0-9]+]], w0, #0x1
+; FISEL: orr {{w[0-9]+}}, [[REG]], w1
%1 = select i1 %c, i1 %a, i1 true
ret i1 %1
}
define zeroext i1 @select_opt3(i1 zeroext %c, i1 zeroext %a) {
; CHECK-LABEL: select_opt3
-; CHECK: bic {{w[0-9]+}}, w1, w0
+; SISEL: eor [[REG:w[0-9]+]], w0, #0x1
+; SISEL: and w0, [[REG]], w1
+; FISEL: bic {{w[0-9]+}}, w1, w0
%1 = select i1 %c, i1 false, i1 %a
ret i1 %1
}
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index 9c72afd84fa7c..4871d729a465b 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -319,8 +319,9 @@ define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %a) {
define i32 @ctz_and_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: ctz_and_nxv16i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmpne p2.b, p1/z, z0.b, z1.b
+; CHECK-NEXT: and p0.b, p0/z, p0.b, p2.b
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
; CHECK-NEXT: cntp x0, p0, p0.b
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
index afe13851f0b95..0d7f230062650 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
@@ -24,7 +24,8 @@ define i64 @scalable_int_min_max(ptr %arg, ptr %arg1, <vscale x 2 x ptr> %i37, <
; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z4.s
; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z3.s
; CHECK-NEXT: add z0.d, z2.d, z1.d
-; CHECK-NEXT: bic p2.b, p1/z, p1.b, p2.b
+; CHECK-NEXT: not p2.b, p0/z, p2.b
+; CHECK-NEXT: and p2.b, p1/z, p1.b, p2.b
; CHECK-NEXT: mov z0.d, p2/m, z2.d
; CHECK-NEXT: sel z0.d, p1, z0.d, z2.d
; CHECK-NEXT: uaddv d0, p0, z0.d
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index b2f9bf89d9ec6..f68d60a381331 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -476,28 +476,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7
@@ -508,7 +501,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -1042,10 +1034,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
@@ -2747,28 +2739,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7
@@ -2779,7 +2764,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -3313,10 +3297,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 16a03badcb132..efb89499b29f0 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -16,103 +16,103 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_xor_b32_e32 v18, v24, v2
; SDAG-NEXT: v_xor_b32_e32 v1, v24, v1
; SDAG-NEXT: v_xor_b32_e32 v0, v24, v0
-; SDAG-NEXT: v_xor_b32_e32 v19, v25, v11
-; SDAG-NEXT: v_xor_b32_e32 v20, v25, v10
-; SDAG-NEXT: v_xor_b32_e32 v9, v25, v9
-; SDAG-NEXT: v_xor_b32_e32 v8, v25, v8
+; SDAG-NEXT: v_xor_b32_e32 v11, v25, v11
+; SDAG-NEXT: v_xor_b32_e32 v10, v25, v10
+; SDAG-NEXT: v_xor_b32_e32 v19, v25, v9
+; SDAG-NEXT: v_xor_b32_e32 v20, v25, v8
; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v24
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v1, v24, vcc
; SDAG-NEXT: v_ffbh_u32_e32 v0, v2
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v18, v24, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v18, v24, vcc
; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 32, v0
; SDAG-NEXT: v_ffbh_u32_e32 v18, v3
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v17, v24, vcc
-; SDAG-NEXT: v_or_b32_e32 v0, v2, v10
-; SDAG-NEXT: v_ffbh_u32_e32 v17, v10
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v17, v24, vcc
+; SDAG-NEXT: v_or_b32_e32 v0, v2, v8
+; SDAG-NEXT: v_ffbh_u32_e32 v17, v8
; SDAG-NEXT: v_min_u32_e32 v18, v1, v18
-; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v8, v25
-; SDAG-NEXT: v_or_b32_e32 v1, v3, v11
-; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], 32, v17
-; SDAG-NEXT: v_ffbh_u32_e32 v17, v11
+; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v20, v25
+; SDAG-NEXT: v_or_b32_e32 v1, v3, v9
+; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], 32, v17
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v9
; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 64, v18
; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v9, v25, vcc
+; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v19, v25, vcc
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; SDAG-NEXT: v_ffbh_u32_e32 v1, v28
-; SDAG-NEXT: v_min_u32_e32 v8, v8, v17
-; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, s[6:7]
-; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v20, v25, vcc
-; SDAG-NEXT: v_add_i32_e64 v9, s[8:9], 32, v1
-; SDAG-NEXT: v_ffbh_u32_e32 v20, v29
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v8, s[6:7]
-; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v19, v25, vcc
-; SDAG-NEXT: v_or_b32_e32 v8, v28, v0
-; SDAG-NEXT: v_ffbh_u32_e32 v19, v0
-; SDAG-NEXT: v_min_u32_e32 v20, v9, v20
-; SDAG-NEXT: v_or_b32_e32 v9, v29, v1
-; SDAG-NEXT: v_add_i32_e32 v19, vcc, 32, v19
+; SDAG-NEXT: v_min_u32_e32 v17, v17, v20
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[8:9]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v10, v25, vcc
+; SDAG-NEXT: v_add_i32_e64 v20, s[8:9], 32, v1
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v29
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v18, v17, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v11, v25, vcc
+; SDAG-NEXT: v_or_b32_e32 v10, v28, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v0
+; SDAG-NEXT: v_min_u32_e32 v20, v20, v21
+; SDAG-NEXT: v_or_b32_e32 v11, v29, v1
+; SDAG-NEXT: v_add_i32_e32 v18, vcc, 32, v18
; SDAG-NEXT: v_ffbh_u32_e32 v21, v1
; SDAG-NEXT: v_add_i32_e32 v20, vcc, 64, v20
; SDAG-NEXT: v_addc_u32_e64 v22, s[6:7], 0, 0, vcc
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT: v_min_u32_e32 v8, v19, v21
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_min_u32_e32 v10, v18, v21
; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v22, 0, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v22, 0, s[6:7]
; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[6:7]
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v18
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v17, vcc
-; SDAG-NEXT: v_xor_b32_e32 v17, 0x7f, v8
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[6:7]
+; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v17
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v19, vcc
+; SDAG-NEXT: v_xor_b32_e32 v17, 0x7f, v10
; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v16, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v16, vcc
; SDAG-NEXT: v_or_b32_e32 v16, v17, v18
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v17, v9, v19
+; SDAG-NEXT: v_or_b32_e32 v17, v11, v19
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
; SDAG-NEXT: v_and_b32_e32 v16, 1, v20
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v11, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v9, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v10, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v8, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v21, v3, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB0_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
-; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v8
-; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v10
+; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v17, 0
-; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v9, vcc
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v11, vcc
; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v20
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v18, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v19, vcc
; SDAG-NEXT: v_or_b32_e32 v18, v30, v32
-; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v8
+; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v10
; SDAG-NEXT: v_or_b32_e32 v19, v31, v33
-; SDAG-NEXT: v_lshl_b64 v[8:9], v[10:11], v34
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[8:9], v34
; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34
; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v34
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
; SDAG-NEXT: v_lshr_b64 v[18:19], v[2:3], v35
-; SDAG-NEXT: v_or_b32_e32 v9, v9, v19
-; SDAG-NEXT: v_or_b32_e32 v8, v8, v18
+; SDAG-NEXT: v_or_b32_e32 v11, v11, v19
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v18
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v8, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v18, 0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -122,24 +122,24 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshr_b64 v[16:17], v[2:3], v30
; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30
; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30
-; SDAG-NEXT: v_lshr_b64 v[37:38], v[10:11], v30
+; SDAG-NEXT: v_lshr_b64 v[37:38], v[8:9], v30
; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v28
; SDAG-NEXT: s_mov_b64 s[10:11], 0
; SDAG-NEXT: v_mov_b32_e32 v22, 0
; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: v_mov_b32_e32 v18, 0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
-; SDAG-NEXT: v_lshl_b64 v[48:49], v[10:11], v35
-; SDAG-NEXT: v_lshr_b64 v[10:11], v[10:11], v36
+; SDAG-NEXT: v_lshl_b64 v[48:49], v[8:9], v35
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[8:9], v36
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v29, vcc
; SDAG-NEXT: v_or_b32_e32 v17, v17, v49
; SDAG-NEXT: v_or_b32_e32 v16, v16, v48
; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v11, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v10, v16, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v38, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v37, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v9, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v8, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v38, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v37, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
; SDAG-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
@@ -147,22 +147,22 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v17, 0
; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v3
; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v9
-; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v11
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v21
; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v16
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v16
; SDAG-NEXT: v_or_b32_e32 v2, v2, v38
-; SDAG-NEXT: v_or_b32_e32 v8, v8, v39
-; SDAG-NEXT: v_or_b32_e32 v9, v19, v9
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v39
+; SDAG-NEXT: v_or_b32_e32 v11, v19, v11
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v34, v2
-; SDAG-NEXT: v_or_b32_e32 v8, v18, v8
+; SDAG-NEXT: v_or_b32_e32 v10, v18, v10
; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v35, v3, vcc
-; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v10, vcc
-; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v11, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v8, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v9, vcc
; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v16
; SDAG-NEXT: v_and_b32_e32 v39, v38, v28
; SDAG-NEXT: v_and_b32_e32 v48, v38, v29
@@ -171,8 +171,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_and_b32_e32 v38, v38, v1
; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v39
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v10, v49, vcc
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v38, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v8, v49, vcc
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v38, vcc
; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
@@ -191,7 +191,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB0_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[10:11], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21
; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
; SDAG-NEXT: v_or_b32_e32 v0, v0, v8
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index c3a6cd5975a77..243c741c17088 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -15,13 +15,16 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
; GCN-NEXT: [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[S_LOAD_DWORD_IMM]]
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 65536, [[S_LOAD_DWORD_IMM]], implicit-def dead $scc
- ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
- ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $scc
- ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GCN-NEXT: S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_2]], implicit-def $scc
+ ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GCN-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def dead $scc
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LSHR_B32_]]
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY3]], implicit-def dead $scc
+ ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $scc
- ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY4]], killed [[COPY3]], implicit-def dead $scc
+ ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GCN-NEXT: S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_3]], implicit-def $scc
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc
+ ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY5]], killed [[COPY4]], implicit-def dead $scc
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
; GCN-NEXT: S_ENDPGM 0
@@ -65,15 +68,16 @@ define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
- ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
- ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY4]], implicit-def dead $scc
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY killed [[S_LOAD_DWORDX2_IMM1]]
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY5]], implicit-def dead $scc
; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
- ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc
- ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GCN-NEXT: S_CMP_LT_I32 killed [[COPY3]], killed [[S_MOV_B32_2]], implicit-def $scc
; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $scc
- ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY6]], killed [[COPY5]], implicit-def dead $scc
+ ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GCN-NEXT: S_CMP_LT_I32 killed [[COPY4]], killed [[S_MOV_B32_2]], implicit-def $scc
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc
+ ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY7]], killed [[COPY6]], implicit-def dead $scc
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
; GCN-NEXT: S_ENDPGM 0
@@ -122,13 +126,14 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[S_LOAD_DWORD_IMM]], implicit-def dead $scc
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY7]], implicit-def dead $scc
; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
- ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc
+ ; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $scc
; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GCN-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
- ; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
- ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY7]], implicit-def dead $scc
+ ; GCN-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
+ ; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY9]], implicit $exec
+ ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY8]], implicit-def dead $scc
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.2, addrspace 1)
; GCN-NEXT: S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 99818df6175bd..667a3f398c08a 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -22,13 +22,13 @@ define i128 @fptosi_f64_to_i128(double %x) {
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
-; SDAG-NEXT: s_movk_i32 s4, 0xff7f
+; SDAG-NEXT: s_movk_i32 s6, 0xff7f
; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
-; SDAG-NEXT: s_mov_b32 s5, -1
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT: s_mov_b32 s7, -1
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@@ -394,13 +394,13 @@ define i128 @fptoui_f64_to_i128(double %x) {
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
-; SDAG-NEXT: s_movk_i32 s4, 0xff7f
+; SDAG-NEXT: s_movk_i32 s6, 0xff7f
; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
-; SDAG-NEXT: s_mov_b32 s5, -1
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT: s_mov_b32 s7, -1
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@@ -765,13 +765,13 @@ define i128 @fptosi_f32_to_i128(float %x) {
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT: s_movk_i32 s4, 0xff7f
+; SDAG-NEXT: s_movk_i32 s6, 0xff7f
; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT: s_mov_b32 s5, -1
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT: s_mov_b32 s7, -1
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1123,13 +1123,13 @@ define i128 @fptoui_f32_to_i128(float %x) {
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT: s_movk_i32 s4, 0xff7f
+; SDAG-NEXT: s_movk_i32 s6, 0xff7f
; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT: s_mov_b32 s5, -1
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT: s_mov_b32 s7, -1
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1509,13 +1509,13 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT: s_movk_i32 s4, 0xff7f
+; SDAG-NEXT: s_movk_i32 s6, 0xff7f
; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT: s_mov_b32 s5, -1
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT: s_mov_b32 s7, -1
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
; SDAG-NEXT: v_cmp_lt_i16_e32 vcc, -1, v4
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@@ -1860,13 +1860,13 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT: s_movk_i32 s4, 0xff7f
+; SDAG-NEXT: s_movk_i32 s6, 0xff7f
; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT: s_mov_b32 s5, -1
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT: s_mov_b32 s7, -1
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
; SDAG-NEXT: v_cmp_lt_i16_e32 vcc, -1, v4
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index b068d87c4d6f4..23291786286a7 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -510,28 +510,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7
@@ -542,7 +535,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -1076,10 +1068,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
@@ -1908,28 +1900,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7
@@ -1940,7 +1925,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
@@ -2474,10 +2458,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/PowerPC/pr40922.ll b/llvm/test/CodeGen/PowerPC/pr40922.ll
index 9252e9a3e3aa4..2d9add6a19857 100644
--- a/llvm/test/CodeGen/PowerPC/pr40922.ll
+++ b/llvm/test/CodeGen/PowerPC/pr40922.ll
@@ -23,11 +23,12 @@ define i32 @a() {
; CHECK-NEXT: li 5, 0
; CHECK-NEXT: mr 30, 3
; CHECK-NEXT: addic 6, 4, 6
-; CHECK-NEXT: addze 5, 5
; CHECK-NEXT: rlwinm 6, 6, 0, 28, 26
-; CHECK-NEXT: andi. 5, 5, 1
+; CHECK-NEXT: addze 5, 5
; CHECK-NEXT: cmplw 1, 6, 4
-; CHECK-NEXT: crorc 20, 1, 4
+; CHECK-NEXT: andi. 5, 5, 1
+; CHECK-NEXT: crnot 20, 4
+; CHECK-NEXT: cror 20, 1, 20
; CHECK-NEXT: bc 12, 20, .LBB0_2
; CHECK-NEXT: # %bb.1: # %if.then
; CHECK-NEXT: bl e
diff --git a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
index 3fa494e1a57dd..f9b9c8a69d431 100644
--- a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
+++ b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
@@ -54,7 +54,8 @@ define i1 @pr85190(i64 %a) {
; CHECK-ZBB-NEXT: li a2, -1
; CHECK-ZBB-NEXT: slli a2, a2, 63
; CHECK-ZBB-NEXT: sub a2, a2, a1
-; CHECK-ZBB-NEXT: slt a0, a0, a2
+; CHECK-ZBB-NEXT: min a1, a2, zero
+; CHECK-ZBB-NEXT: slt a0, a0, a1
; CHECK-ZBB-NEXT: ret
%or = or i64 %a, 7
%cmp1 = icmp slt i64 %a, 0
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 8a6a30318ae58..62f5d49192ea9 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -13,7 +13,6 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
define dso_local void @m() local_unnamed_addr #1 {
; CHECK-LABEL: m:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
; CHECK-NEXT: aghi %r15, -168
; CHECK-NEXT: lhrl %r1, f+4
; CHECK-NEXT: sll %r1, 8
@@ -21,59 +20,66 @@ define dso_local void @m() local_unnamed_addr #1 {
; CHECK-NEXT: ic %r1, 6(%r2)
; CHECK-NEXT: larl %r2, e
; CHECK-NEXT: lb %r0, 3(%r2)
-; CHECK-NEXT: vlvgp %v0, %r0, %r1
-; CHECK-NEXT: vlvgp %v1, %r1, %r0
; CHECK-NEXT: vlvgf %v1, %r1, 0
-; CHECK-NEXT: vlvgf %v1, %r1, 2
-; CHECK-NEXT: vlvgp %v2, %r1, %r1
-; CHECK-NEXT: # kill: def $r1l killed $r1l killed $r1d
+; CHECK-NEXT: vlvgf %v1, %r1, 1
+; CHECK-NEXT: larl %r2, .LCPI0_0
+; CHECK-NEXT: vl %v2, 0(%r2), 3
+; CHECK-NEXT: vlvgf %v1, %r1, 3
+; CHECK-NEXT: vlvgf %v3, %r1, 3
+; CHECK-NEXT: vlvgf %v0, %r1, 1
+; CHECK-NEXT: vperm %v4, %v1, %v0, %v2
+; CHECK-NEXT: vlvgf %v0, %r1, 3
; CHECK-NEXT: nilh %r1, 255
; CHECK-NEXT: chi %r1, 128
; CHECK-NEXT: ipm %r1
; CHECK-NEXT: risbg %r1, %r1, 63, 191, 36
-; CHECK-NEXT: vlvgf %v0, %r0, 0
-; CHECK-NEXT: vlvgf %v0, %r0, 2
-; CHECK-NEXT: vgbm %v3, 30583
-; CHECK-NEXT: vn %v0, %v0, %v3
-; CHECK-NEXT: vn %v1, %v1, %v3
-; CHECK-NEXT: vrepf %v2, %v2, 1
-; CHECK-NEXT: vn %v2, %v2, %v3
-; CHECK-NEXT: vrepif %v3, 127
-; CHECK-NEXT: vchlf %v1, %v1, %v3
-; CHECK-NEXT: vlgvf %r13, %v1, 0
-; CHECK-NEXT: vchlf %v2, %v2, %v3
+; CHECK-NEXT: vperm %v0, %v3, %v0, %v2
+; CHECK-NEXT: larl %r2, .LCPI0_1
+; CHECK-NEXT: vl %v5, 0(%r2), 3
+; CHECK-NEXT: vgbm %v6, 30583
+; CHECK-NEXT: vn %v0, %v0, %v6
+; CHECK-NEXT: vn %v4, %v4, %v6
+; CHECK-NEXT: vperm %v1, %v1, %v1, %v5
+; CHECK-NEXT: vn %v5, %v1, %v6
+; CHECK-NEXT: vperm %v1, %v0, %v3, %v2
+; CHECK-NEXT: vn %v2, %v1, %v6
+; CHECK-NEXT: vrepif %v1, 127
+; CHECK-NEXT: vchlf %v3, %v5, %v1
+; CHECK-NEXT: vlgvf %r3, %v3, 1
+; CHECK-NEXT: vlgvf %r2, %v3, 0
+; CHECK-NEXT: risbg %r2, %r2, 48, 176, 15
+; CHECK-NEXT: rosbg %r2, %r3, 49, 49, 14
+; CHECK-NEXT: vlgvf %r3, %v3, 2
+; CHECK-NEXT: rosbg %r2, %r3, 50, 50, 13
+; CHECK-NEXT: vlgvf %r3, %v3, 3
+; CHECK-NEXT: rosbg %r2, %r3, 51, 51, 12
+; CHECK-NEXT: vchlf %v3, %v4, %v1
+; CHECK-NEXT: vlgvf %r3, %v3, 0
+; CHECK-NEXT: rosbg %r2, %r3, 52, 52, 11
+; CHECK-NEXT: vlgvf %r3, %v3, 1
+; CHECK-NEXT: rosbg %r2, %r3, 53, 53, 10
+; CHECK-NEXT: vlgvf %r3, %v3, 2
+; CHECK-NEXT: rosbg %r2, %r3, 54, 54, 9
+; CHECK-NEXT: vlgvf %r3, %v3, 3
+; CHECK-NEXT: rosbg %r2, %r3, 55, 55, 8
+; CHECK-NEXT: vchlf %v2, %v2, %v1
+; CHECK-NEXT: vlgvf %r3, %v2, 0
+; CHECK-NEXT: rosbg %r2, %r3, 56, 56, 7
; CHECK-NEXT: vlgvf %r3, %v2, 1
-; CHECK-NEXT: nilf %r3, 1
-; CHECK-NEXT: vlgvf %r4, %v2, 0
-; CHECK-NEXT: risbg %r2, %r4, 48, 176, 15
-; CHECK-NEXT: rosbg %r2, %r3, 32, 49, 14
-; CHECK-NEXT: vlgvf %r5, %v2, 2
-; CHECK-NEXT: nilf %r5, 1
-; CHECK-NEXT: rosbg %r2, %r5, 32, 50, 13
-; CHECK-NEXT: vlgvf %r14, %v2, 3
-; CHECK-NEXT: nilf %r14, 1
-; CHECK-NEXT: rosbg %r2, %r14, 32, 51, 12
-; CHECK-NEXT: rosbg %r2, %r13, 52, 52, 11
-; CHECK-NEXT: vlgvf %r13, %v1, 1
-; CHECK-NEXT: rosbg %r2, %r13, 53, 53, 10
-; CHECK-NEXT: vlgvf %r13, %v1, 2
-; CHECK-NEXT: rosbg %r2, %r13, 54, 54, 9
-; CHECK-NEXT: vlgvf %r13, %v1, 3
-; CHECK-NEXT: rosbg %r2, %r13, 55, 55, 8
-; CHECK-NEXT: vchlf %v0, %v0, %v3
-; CHECK-NEXT: vlgvf %r13, %v0, 0
-; CHECK-NEXT: rosbg %r2, %r13, 56, 56, 7
-; CHECK-NEXT: vlgvf %r13, %v0, 1
-; CHECK-NEXT: rosbg %r2, %r13, 57, 57, 6
-; CHECK-NEXT: vlgvf %r13, %v0, 2
-; CHECK-NEXT: rosbg %r2, %r13, 58, 58, 5
-; CHECK-NEXT: vlgvf %r13, %v0, 3
-; CHECK-NEXT: rosbg %r2, %r13, 59, 59, 4
-; CHECK-NEXT: nilf %r4, 1
-; CHECK-NEXT: rosbg %r2, %r4, 32, 60, 3
-; CHECK-NEXT: rosbg %r2, %r3, 32, 61, 2
-; CHECK-NEXT: rosbg %r2, %r5, 32, 62, 1
-; CHECK-NEXT: or %r2, %r14
+; CHECK-NEXT: rosbg %r2, %r3, 57, 57, 6
+; CHECK-NEXT: vlgvf %r3, %v2, 2
+; CHECK-NEXT: rosbg %r2, %r3, 58, 58, 5
+; CHECK-NEXT: vlgvf %r3, %v2, 3
+; CHECK-NEXT: rosbg %r2, %r3, 59, 59, 4
+; CHECK-NEXT: vchlf %v0, %v0, %v1
+; CHECK-NEXT: vlgvf %r3, %v0, 0
+; CHECK-NEXT: rosbg %r2, %r3, 60, 60, 3
+; CHECK-NEXT: vlgvf %r3, %v0, 1
+; CHECK-NEXT: rosbg %r2, %r3, 61, 61, 2
+; CHECK-NEXT: vlgvf %r3, %v0, 2
+; CHECK-NEXT: rosbg %r2, %r3, 62, 62, 1
+; CHECK-NEXT: vlgvf %r3, %v0, 3
+; CHECK-NEXT: rosbg %r2, %r3, 63, 63, 0
; CHECK-NEXT: vlgvb %r4, %v0, 1
; CHECK-NEXT: vlgvb %r3, %v0, 0
; CHECK-NEXT: risbg %r3, %r3, 48, 176, 15
@@ -116,7 +122,7 @@ define dso_local void @m() local_unnamed_addr #1 {
; CHECK-NEXT: nr %r2, %r0
; CHECK-NEXT: larl %r1, g
; CHECK-NEXT: stc %r2, 0(%r1)
-; CHECK-NEXT: lmg %r13, %r15, 272(%r15)
+; CHECK-NEXT: aghi %r15, 168
; CHECK-NEXT: br %r14
entry:
%n = alloca i32, align 4
diff --git a/llvm/test/CodeGen/VE/Scalar/max.ll b/llvm/test/CodeGen/VE/Scalar/max.ll
index 12aa101cb48c4..51da557c6c49f 100644
--- a/llvm/test/CodeGen/VE/Scalar/max.ll
+++ b/llvm/test/CodeGen/VE/Scalar/max.ll
@@ -281,11 +281,13 @@ define zeroext i1 @maxi1(i1 zeroext, i1 zeroext) {
; CHECK-LABEL: maxi1:
; CHECK: # %bb.0:
; CHECK-NEXT: or %s0, %s0, %s1
+; CHECK-NEXT: and %s0, 1, %s0
; CHECK-NEXT: b.l.t (, %s10)
;
; OPT-LABEL: maxi1:
; OPT: # %bb.0:
; OPT-NEXT: or %s0, %s0, %s1
+; OPT-NEXT: and %s0, 1, %s0
; OPT-NEXT: b.l.t (, %s10)
%3 = xor i1 %1, true
%4 = and i1 %3, %0
diff --git a/llvm/test/CodeGen/VE/Scalar/min.ll b/llvm/test/CodeGen/VE/Scalar/min.ll
index da92ebafd0590..69d5ce48601f8 100644
--- a/llvm/test/CodeGen/VE/Scalar/min.ll
+++ b/llvm/test/CodeGen/VE/Scalar/min.ll
@@ -278,6 +278,7 @@ define i32 @min2u32(i32, i32) {
define zeroext i1 @mini1(i1 zeroext, i1 zeroext) {
; CHECK-LABEL: mini1:
; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: and %s2, %s1, %s0
; CHECK-NEXT: cmov.w.ne %s2, %s1, %s0
; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1
@@ -285,6 +286,7 @@ define zeroext i1 @mini1(i1 zeroext, i1 zeroext) {
;
; OPT-LABEL: mini1:
; OPT: # %bb.0:
+; OPT-NEXT: and %s0, %s0, (32)0
; OPT-NEXT: and %s2, %s1, %s0
; OPT-NEXT: cmov.w.ne %s2, %s1, %s0
; OPT-NEXT: adds.w.zx %s0, %s2, (0)1
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 33cc8e96f663f..163e3c9eda723 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -284,10 +284,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl $127, %ecx
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpl %ebp, %ecx
-; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl $0, %ecx
; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -301,12 +301,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: cmovnel %esi, %eax
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: jne .LBB4_8
; X86-NEXT: # %bb.1: # %_udiv-special-cases
; X86-NEXT: movl %edi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: xorl $127, %edi
-; X86-NEXT: orl %ebp, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: orl %edi, %ecx
@@ -351,9 +352,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: shll %cl, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: adcl $0, %edx
; X86-NEXT: jae .LBB4_3
@@ -363,7 +365,6 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: jmp .LBB4_7
; X86-NEXT: .LBB4_3: # %udiv-preheader
-; X86-NEXT: movl %ecx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
@@ -379,7 +380,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movb %dl, %ch
; X86-NEXT: andb $7, %ch
; X86-NEXT: movb %dl, %cl
@@ -493,6 +494,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: # %bb.5:
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
@@ -518,11 +520,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sbbl %ecx, %ebx
; X86-NEXT: sbbl %ecx, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %esi, (%ecx)
-; X86-NEXT: movl %eax, 4(%ecx)
-; X86-NEXT: movl %ebx, 8(%ecx)
-; X86-NEXT: movl %edx, 12(%ecx)
+; X86-NEXT: movl %esi, (%ebp)
+; X86-NEXT: movl %eax, 4(%ebp)
+; X86-NEXT: movl %ebx, 8(%ebp)
+; X86-NEXT: movl %edx, 12(%ebp)
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebx, %edi
diff --git a/llvm/test/CodeGen/X86/pr64589.ll b/llvm/test/CodeGen/X86/pr64589.ll
index 130ef517ae28e..d93d54f4c31d0 100644
--- a/llvm/test/CodeGen/X86/pr64589.ll
+++ b/llvm/test/CodeGen/X86/pr64589.ll
@@ -7,8 +7,8 @@
define i8 @test(ptr %p) {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
-; CHECK-NEXT: movzbl (%rdi), %eax
-; CHECK-NEXT: orb 1(%rdi), %al
+; CHECK-NEXT: movzbl 1(%rdi), %eax
+; CHECK-NEXT: orb (%rdi), %al
; CHECK-NEXT: setne %al
; CHECK-NEXT: addb %al, %al
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
index ec7dca4285a35..fe17e415dbeb4 100644
--- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
@@ -1541,8 +1541,9 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: psllq $63, %xmm0
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: cmpl $3, %eax
; SSE2-NEXT: sete %al
@@ -1550,26 +1551,42 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
;
; SSE42-LABEL: select_v2i8:
; SSE42: # %bb.0:
-; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT: pxor %xmm0, %xmm1
-; SSE42-NEXT: ptest %xmm1, %xmm1
+; SSE42-NEXT: movzwl (%rdi), %eax
+; SSE42-NEXT: movd %eax, %xmm0
+; SSE42-NEXT: movzwl (%rsi), %eax
+; SSE42-NEXT: movd %eax, %xmm1
+; SSE42-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT: psllq $63, %xmm0
+; SSE42-NEXT: movmskpd %xmm0, %eax
+; SSE42-NEXT: cmpl $3, %eax
; SSE42-NEXT: sete %al
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: select_v2i8:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vptest %xmm0, %xmm0
-; AVX1OR2-NEXT: sete %al
+; AVX1OR2-NEXT: movzwl (%rdi), %eax
+; AVX1OR2-NEXT: vmovd %eax, %xmm0
+; AVX1OR2-NEXT: movzwl (%rsi), %eax
+; AVX1OR2-NEXT: vmovd %eax, %xmm1
+; AVX1OR2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1OR2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT: vtestpd %xmm1, %xmm0
+; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: select_v2i8:
; AVX512: # %bb.0:
; AVX512-NEXT: movzwl (%rdi), %eax
-; AVX512-NEXT: cmpw (%rsi), %ax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: movzwl (%rsi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; AVX512-NEXT: knotw %k0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: testb $3, %al
; AVX512-NEXT: sete %al
; AVX512-NEXT: retq
%v0 = load <2 x i8>, ptr %s0, align 1
diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
index 951bcfa8fc1b7..4f91eb2cb0a5a 100644
--- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -1424,8 +1424,9 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: psllq $63, %xmm0
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: setne %al
@@ -1433,19 +1434,27 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
;
; SSE42-LABEL: select_v2i8:
; SSE42: # %bb.0:
-; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT: pcmpeqq %xmm0, %xmm1
-; SSE42-NEXT: movmskpd %xmm1, %eax
+; SSE42-NEXT: movzwl (%rdi), %eax
+; SSE42-NEXT: movd %eax, %xmm0
+; SSE42-NEXT: movzwl (%rsi), %eax
+; SSE42-NEXT: movd %eax, %xmm1
+; SSE42-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT: psllq $63, %xmm0
+; SSE42-NEXT: movmskpd %xmm0, %eax
; SSE42-NEXT: testl %eax, %eax
; SSE42-NEXT: setne %al
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: select_v2i8:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: movzwl (%rdi), %eax
+; AVX1OR2-NEXT: vmovd %eax, %xmm0
+; AVX1OR2-NEXT: movzwl (%rsi), %eax
+; AVX1OR2-NEXT: vmovd %eax, %xmm1
+; AVX1OR2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1OR2-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0
; AVX1OR2-NEXT: setne %al
; AVX1OR2-NEXT: retq
>From e57c1432d57227969a9d6a4f19fd5469a39a4a14 Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson at ericsson.com>
Date: Fri, 5 Apr 2024 10:57:08 +0200
Subject: [PATCH 2/2] [DAGCombiner] Push freeze through SETCC and SELECT_CC
Allow pushing freeze through SETCC and SELECT_CC even if there are
multiple "maybe poison" operands. Previously we limited this fold to
a single "maybe poison" operand, but it seems profitable to also
allow the multiple-operand scenario.
One goal here is to avoid some regressions seen in review of
https://github.com/llvm/llvm-project/pull/84924
when solving the select->and miscompiles described in
https://github.com/llvm/llvm-project/issues/84653
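For illustration, the rewrite this enables can be sketched in DAG
notation as follows (node names and types are invented for this
example, not taken from the patch or its tests):

  Before:  t2: i1 = setcc t0, t1, setlt
           t3: i1 = freeze t2

  After:   t4: i32 = freeze t0
           t5: i32 = freeze t1
           t3: i1 = setcc t4, t5, setlt

Freezing every maybe-poison operand keeps the push sound: SETCC only
propagates poison from its operands, so once both operands are frozen
the result of the pushed-down freeze cannot be more poisonous than
the original frozen SETCC.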
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +
llvm/test/CodeGen/PowerPC/pr40922.ll | 7 +-
llvm/test/CodeGen/RISCV/double-convert.ll | 3 +-
.../CodeGen/RISCV/double-round-conv-sat.ll | 174 +++++++++---------
.../test/CodeGen/X86/vector-compare-all_of.ll | 11 +-
.../test/CodeGen/X86/vector-compare-any_of.ll | 11 +-
6 files changed, 98 insertions(+), 110 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 694fee6c09bdb..13683cd2889e8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15582,6 +15582,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
return SDValue();
bool AllowMultipleMaybePoisonOperands =
+ N0.getOpcode() == ISD::SELECT_CC ||
+ N0.getOpcode() == ISD::SETCC ||
N0.getOpcode() == ISD::BUILD_VECTOR ||
N0.getOpcode() == ISD::BUILD_PAIR ||
N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
diff --git a/llvm/test/CodeGen/PowerPC/pr40922.ll b/llvm/test/CodeGen/PowerPC/pr40922.ll
index 2d9add6a19857..9252e9a3e3aa4 100644
--- a/llvm/test/CodeGen/PowerPC/pr40922.ll
+++ b/llvm/test/CodeGen/PowerPC/pr40922.ll
@@ -23,12 +23,11 @@ define i32 @a() {
; CHECK-NEXT: li 5, 0
; CHECK-NEXT: mr 30, 3
; CHECK-NEXT: addic 6, 4, 6
-; CHECK-NEXT: rlwinm 6, 6, 0, 28, 26
; CHECK-NEXT: addze 5, 5
-; CHECK-NEXT: cmplw 1, 6, 4
+; CHECK-NEXT: rlwinm 6, 6, 0, 28, 26
; CHECK-NEXT: andi. 5, 5, 1
-; CHECK-NEXT: crnot 20, 4
-; CHECK-NEXT: cror 20, 1, 20
+; CHECK-NEXT: cmplw 1, 6, 4
+; CHECK-NEXT: crorc 20, 1, 4
; CHECK-NEXT: bc 12, 20, .LBB0_2
; CHECK-NEXT: # %bb.1: # %if.then
; CHECK-NEXT: bl e
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index 6024a29da33d2..feea4f19720b0 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -742,9 +742,8 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI12_0)(a2)
; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0
; RV32IZFINXZDINX-NEXT: lui a5, 524288
-; RV32IZFINXZDINX-NEXT: li a4, 1
; RV32IZFINXZDINX-NEXT: lui a3, 524288
-; RV32IZFINXZDINX-NEXT: bne a2, a4, .LBB12_2
+; RV32IZFINXZDINX-NEXT: beqz a2, .LBB12_2
; RV32IZFINXZDINX-NEXT: # %bb.1: # %start
; RV32IZFINXZDINX-NEXT: mv a3, a1
; RV32IZFINXZDINX-NEXT: .LBB12_2: # %start
diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
index 927eee2e9e545..0839f61b2d793 100644
--- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
@@ -102,30 +102,29 @@ define i64 @test_floor_si64(double %x) nounwind {
; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI1_0)
; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI1_0+4)(a2)
; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI1_0)(a2)
-; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI1_1)
+; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI1_1+4)(a4)
+; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI1_1)(a4)
+; RV32IZFINXZDINX-NEXT: fle.d a6, a2, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a6
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
+; RV32IZFINXZDINX-NEXT: flt.d a4, a4, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a4
+; RV32IZFINXZDINX-NEXT: or a0, a2, a0
+; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a2
; RV32IZFINXZDINX-NEXT: lui a5, 524288
-; RV32IZFINXZDINX-NEXT: li a4, 1
; RV32IZFINXZDINX-NEXT: lui a3, 524288
-; RV32IZFINXZDINX-NEXT: bne a2, a4, .LBB1_2
+; RV32IZFINXZDINX-NEXT: beqz a6, .LBB1_2
; RV32IZFINXZDINX-NEXT: # %bb.1:
; RV32IZFINXZDINX-NEXT: mv a3, a1
; RV32IZFINXZDINX-NEXT: .LBB1_2:
-; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI1_1)
-; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI1_1)(a1)
-; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI1_1+4)(a1)
-; RV32IZFINXZDINX-NEXT: flt.d a4, a6, s0
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
; RV32IZFINXZDINX-NEXT: beqz a4, .LBB1_4
; RV32IZFINXZDINX-NEXT: # %bb.3:
; RV32IZFINXZDINX-NEXT: addi a3, a5, -1
; RV32IZFINXZDINX-NEXT: .LBB1_4:
-; RV32IZFINXZDINX-NEXT: feq.d a1, s0, s0
-; RV32IZFINXZDINX-NEXT: neg a5, a1
-; RV32IZFINXZDINX-NEXT: and a1, a5, a3
-; RV32IZFINXZDINX-NEXT: neg a2, a2
-; RV32IZFINXZDINX-NEXT: and a0, a2, a0
-; RV32IZFINXZDINX-NEXT: neg a2, a4
-; RV32IZFINXZDINX-NEXT: or a0, a2, a0
-; RV32IZFINXZDINX-NEXT: and a0, a5, a0
+; RV32IZFINXZDINX-NEXT: and a1, a2, a3
; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -347,30 +346,29 @@ define i64 @test_ceil_si64(double %x) nounwind {
; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI5_0)
; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI5_0+4)(a2)
; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI5_0)(a2)
-; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI5_1)
+; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI5_1+4)(a4)
+; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI5_1)(a4)
+; RV32IZFINXZDINX-NEXT: fle.d a6, a2, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a6
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
+; RV32IZFINXZDINX-NEXT: flt.d a4, a4, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a4
+; RV32IZFINXZDINX-NEXT: or a0, a2, a0
+; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a2
; RV32IZFINXZDINX-NEXT: lui a5, 524288
-; RV32IZFINXZDINX-NEXT: li a4, 1
; RV32IZFINXZDINX-NEXT: lui a3, 524288
-; RV32IZFINXZDINX-NEXT: bne a2, a4, .LBB5_2
+; RV32IZFINXZDINX-NEXT: beqz a6, .LBB5_2
; RV32IZFINXZDINX-NEXT: # %bb.1:
; RV32IZFINXZDINX-NEXT: mv a3, a1
; RV32IZFINXZDINX-NEXT: .LBB5_2:
-; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI5_1)
-; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI5_1)(a1)
-; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI5_1+4)(a1)
-; RV32IZFINXZDINX-NEXT: flt.d a4, a6, s0
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
; RV32IZFINXZDINX-NEXT: beqz a4, .LBB5_4
; RV32IZFINXZDINX-NEXT: # %bb.3:
; RV32IZFINXZDINX-NEXT: addi a3, a5, -1
; RV32IZFINXZDINX-NEXT: .LBB5_4:
-; RV32IZFINXZDINX-NEXT: feq.d a1, s0, s0
-; RV32IZFINXZDINX-NEXT: neg a5, a1
-; RV32IZFINXZDINX-NEXT: and a1, a5, a3
-; RV32IZFINXZDINX-NEXT: neg a2, a2
-; RV32IZFINXZDINX-NEXT: and a0, a2, a0
-; RV32IZFINXZDINX-NEXT: neg a2, a4
-; RV32IZFINXZDINX-NEXT: or a0, a2, a0
-; RV32IZFINXZDINX-NEXT: and a0, a5, a0
+; RV32IZFINXZDINX-NEXT: and a1, a2, a3
; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -592,30 +590,29 @@ define i64 @test_trunc_si64(double %x) nounwind {
; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI9_0)
; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI9_0+4)(a2)
; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI9_0)(a2)
-; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI9_1)
+; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI9_1+4)(a4)
+; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI9_1)(a4)
+; RV32IZFINXZDINX-NEXT: fle.d a6, a2, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a6
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
+; RV32IZFINXZDINX-NEXT: flt.d a4, a4, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a4
+; RV32IZFINXZDINX-NEXT: or a0, a2, a0
+; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a2
; RV32IZFINXZDINX-NEXT: lui a5, 524288
-; RV32IZFINXZDINX-NEXT: li a4, 1
; RV32IZFINXZDINX-NEXT: lui a3, 524288
-; RV32IZFINXZDINX-NEXT: bne a2, a4, .LBB9_2
+; RV32IZFINXZDINX-NEXT: beqz a6, .LBB9_2
; RV32IZFINXZDINX-NEXT: # %bb.1:
; RV32IZFINXZDINX-NEXT: mv a3, a1
; RV32IZFINXZDINX-NEXT: .LBB9_2:
-; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI9_1)
-; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI9_1)(a1)
-; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI9_1+4)(a1)
-; RV32IZFINXZDINX-NEXT: flt.d a4, a6, s0
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
; RV32IZFINXZDINX-NEXT: beqz a4, .LBB9_4
; RV32IZFINXZDINX-NEXT: # %bb.3:
; RV32IZFINXZDINX-NEXT: addi a3, a5, -1
; RV32IZFINXZDINX-NEXT: .LBB9_4:
-; RV32IZFINXZDINX-NEXT: feq.d a1, s0, s0
-; RV32IZFINXZDINX-NEXT: neg a5, a1
-; RV32IZFINXZDINX-NEXT: and a1, a5, a3
-; RV32IZFINXZDINX-NEXT: neg a2, a2
-; RV32IZFINXZDINX-NEXT: and a0, a2, a0
-; RV32IZFINXZDINX-NEXT: neg a2, a4
-; RV32IZFINXZDINX-NEXT: or a0, a2, a0
-; RV32IZFINXZDINX-NEXT: and a0, a5, a0
+; RV32IZFINXZDINX-NEXT: and a1, a2, a3
; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -837,30 +834,29 @@ define i64 @test_round_si64(double %x) nounwind {
; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI13_0)
; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI13_0+4)(a2)
; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI13_0)(a2)
-; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI13_1)
+; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI13_1+4)(a4)
+; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI13_1)(a4)
+; RV32IZFINXZDINX-NEXT: fle.d a6, a2, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a6
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
+; RV32IZFINXZDINX-NEXT: flt.d a4, a4, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a4
+; RV32IZFINXZDINX-NEXT: or a0, a2, a0
+; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a2
; RV32IZFINXZDINX-NEXT: lui a5, 524288
-; RV32IZFINXZDINX-NEXT: li a4, 1
; RV32IZFINXZDINX-NEXT: lui a3, 524288
-; RV32IZFINXZDINX-NEXT: bne a2, a4, .LBB13_2
+; RV32IZFINXZDINX-NEXT: beqz a6, .LBB13_2
; RV32IZFINXZDINX-NEXT: # %bb.1:
; RV32IZFINXZDINX-NEXT: mv a3, a1
; RV32IZFINXZDINX-NEXT: .LBB13_2:
-; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI13_1)
-; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI13_1)(a1)
-; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI13_1+4)(a1)
-; RV32IZFINXZDINX-NEXT: flt.d a4, a6, s0
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
; RV32IZFINXZDINX-NEXT: beqz a4, .LBB13_4
; RV32IZFINXZDINX-NEXT: # %bb.3:
; RV32IZFINXZDINX-NEXT: addi a3, a5, -1
; RV32IZFINXZDINX-NEXT: .LBB13_4:
-; RV32IZFINXZDINX-NEXT: feq.d a1, s0, s0
-; RV32IZFINXZDINX-NEXT: neg a5, a1
-; RV32IZFINXZDINX-NEXT: and a1, a5, a3
-; RV32IZFINXZDINX-NEXT: neg a2, a2
-; RV32IZFINXZDINX-NEXT: and a0, a2, a0
-; RV32IZFINXZDINX-NEXT: neg a2, a4
-; RV32IZFINXZDINX-NEXT: or a0, a2, a0
-; RV32IZFINXZDINX-NEXT: and a0, a5, a0
+; RV32IZFINXZDINX-NEXT: and a1, a2, a3
; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -1082,30 +1078,29 @@ define i64 @test_roundeven_si64(double %x) nounwind {
; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI17_0)
; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI17_0+4)(a2)
; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI17_0)(a2)
-; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI17_1)
+; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI17_1+4)(a4)
+; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI17_1)(a4)
+; RV32IZFINXZDINX-NEXT: fle.d a6, a2, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a6
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
+; RV32IZFINXZDINX-NEXT: flt.d a4, a4, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a4
+; RV32IZFINXZDINX-NEXT: or a0, a2, a0
+; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a2
; RV32IZFINXZDINX-NEXT: lui a5, 524288
-; RV32IZFINXZDINX-NEXT: li a4, 1
; RV32IZFINXZDINX-NEXT: lui a3, 524288
-; RV32IZFINXZDINX-NEXT: bne a2, a4, .LBB17_2
+; RV32IZFINXZDINX-NEXT: beqz a6, .LBB17_2
; RV32IZFINXZDINX-NEXT: # %bb.1:
; RV32IZFINXZDINX-NEXT: mv a3, a1
; RV32IZFINXZDINX-NEXT: .LBB17_2:
-; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI17_1)
-; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI17_1)(a1)
-; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI17_1+4)(a1)
-; RV32IZFINXZDINX-NEXT: flt.d a4, a6, s0
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
; RV32IZFINXZDINX-NEXT: beqz a4, .LBB17_4
; RV32IZFINXZDINX-NEXT: # %bb.3:
; RV32IZFINXZDINX-NEXT: addi a3, a5, -1
; RV32IZFINXZDINX-NEXT: .LBB17_4:
-; RV32IZFINXZDINX-NEXT: feq.d a1, s0, s0
-; RV32IZFINXZDINX-NEXT: neg a5, a1
-; RV32IZFINXZDINX-NEXT: and a1, a5, a3
-; RV32IZFINXZDINX-NEXT: neg a2, a2
-; RV32IZFINXZDINX-NEXT: and a0, a2, a0
-; RV32IZFINXZDINX-NEXT: neg a2, a4
-; RV32IZFINXZDINX-NEXT: or a0, a2, a0
-; RV32IZFINXZDINX-NEXT: and a0, a5, a0
+; RV32IZFINXZDINX-NEXT: and a1, a2, a3
; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -1327,30 +1322,29 @@ define i64 @test_rint_si64(double %x) nounwind {
; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI21_0)
; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI21_0+4)(a2)
; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI21_0)(a2)
-; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI21_1)
+; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI21_1+4)(a4)
+; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI21_1)(a4)
+; RV32IZFINXZDINX-NEXT: fle.d a6, a2, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a6
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
+; RV32IZFINXZDINX-NEXT: flt.d a4, a4, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a4
+; RV32IZFINXZDINX-NEXT: or a0, a2, a0
+; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT: neg a2, a2
; RV32IZFINXZDINX-NEXT: lui a5, 524288
-; RV32IZFINXZDINX-NEXT: li a4, 1
; RV32IZFINXZDINX-NEXT: lui a3, 524288
-; RV32IZFINXZDINX-NEXT: bne a2, a4, .LBB21_2
+; RV32IZFINXZDINX-NEXT: beqz a6, .LBB21_2
; RV32IZFINXZDINX-NEXT: # %bb.1:
; RV32IZFINXZDINX-NEXT: mv a3, a1
; RV32IZFINXZDINX-NEXT: .LBB21_2:
-; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI21_1)
-; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI21_1)(a1)
-; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI21_1+4)(a1)
-; RV32IZFINXZDINX-NEXT: flt.d a4, a6, s0
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
; RV32IZFINXZDINX-NEXT: beqz a4, .LBB21_4
; RV32IZFINXZDINX-NEXT: # %bb.3:
; RV32IZFINXZDINX-NEXT: addi a3, a5, -1
; RV32IZFINXZDINX-NEXT: .LBB21_4:
-; RV32IZFINXZDINX-NEXT: feq.d a1, s0, s0
-; RV32IZFINXZDINX-NEXT: neg a5, a1
-; RV32IZFINXZDINX-NEXT: and a1, a5, a3
-; RV32IZFINXZDINX-NEXT: neg a2, a2
-; RV32IZFINXZDINX-NEXT: and a0, a2, a0
-; RV32IZFINXZDINX-NEXT: neg a2, a4
-; RV32IZFINXZDINX-NEXT: or a0, a2, a0
-; RV32IZFINXZDINX-NEXT: and a0, a5, a0
+; RV32IZFINXZDINX-NEXT: and a1, a2, a3
; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
index fe17e415dbeb4..30202701fdb8c 100644
--- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
@@ -1541,9 +1541,8 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: cmpl $3, %eax
; SSE2-NEXT: sete %al
@@ -1556,8 +1555,7 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
; SSE42-NEXT: movzwl (%rsi), %eax
; SSE42-NEXT: movd %eax, %xmm1
; SSE42-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT: psllq $63, %xmm0
+; SSE42-NEXT: pmovsxbq %xmm1, %xmm0
; SSE42-NEXT: movmskpd %xmm0, %eax
; SSE42-NEXT: cmpl $3, %eax
; SSE42-NEXT: sete %al
@@ -1570,8 +1568,7 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
; AVX1OR2-NEXT: movzwl (%rsi), %eax
; AVX1OR2-NEXT: vmovd %eax, %xmm1
; AVX1OR2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1OR2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovsxbq %xmm0, %xmm0
; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT: vtestpd %xmm1, %xmm0
; AVX1OR2-NEXT: setb %al
diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
index 4f91eb2cb0a5a..2df39d69dbb75 100644
--- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -1424,9 +1424,8 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: movmskpd %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: setne %al
@@ -1439,8 +1438,7 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
; SSE42-NEXT: movzwl (%rsi), %eax
; SSE42-NEXT: movd %eax, %xmm1
; SSE42-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT: psllq $63, %xmm0
+; SSE42-NEXT: pmovsxbq %xmm1, %xmm0
; SSE42-NEXT: movmskpd %xmm0, %eax
; SSE42-NEXT: testl %eax, %eax
; SSE42-NEXT: setne %al
@@ -1453,8 +1451,7 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) {
; AVX1OR2-NEXT: movzwl (%rsi), %eax
; AVX1OR2-NEXT: vmovd %eax, %xmm1
; AVX1OR2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1OR2-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovsxbq %xmm0, %xmm0
; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0
; AVX1OR2-NEXT: setne %al
; AVX1OR2-NEXT: retq