[llvm] 55c6bda - Revert "Revert "[SelectionDAG] Handle more opcodes in canCreateUndefOrPoison (#84921)" and more..."
Bjorn Pettersson via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 29 04:09:15 PDT 2024
Author: Bjorn Pettersson
Date: 2024-04-29T13:08:52+02:00
New Revision: 55c6bda01ef5a166a69b43956775272d9d67bda5
URL: https://github.com/llvm/llvm-project/commit/55c6bda01ef5a166a69b43956775272d9d67bda5
DIFF: https://github.com/llvm/llvm-project/commit/55c6bda01ef5a166a69b43956775272d9d67bda5.diff
LOG: Revert "Revert "[SelectionDAG] Handle more opcodes in canCreateUndefOrPoison (#84921)" and more..."
This reverts commit 16bd10a38730fed27a3bf111076b8ef7a7e7b3ee.
Re-applies:
b3c55b707110084a9f50a16aade34c3be6fa18da - "[SelectionDAG] Handle more opcodes in canCreateUndefOrPoison (#84921)"
8e2f6495c0bac1dd6ee32b6a0d24152c9c343624 - "[DAGCombiner] Do not always fold FREEZE over BUILD_VECTOR (#85932)"
73472c5996716cda0dbb3ddb788304e0e7e6a323 - "[SelectionDAG] Treat CopyFromReg as freezing the value (#85932)"
with a fix in DAGCombiner::visitFREEZE.
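For context, the common idea behind the re-applied patches is that freeze(op(x)) may only be rewritten as op(freeze(x)) when op itself cannot introduce undef or poison. A minimal sketch of that guard, assuming the usual SelectionDAG combine environment (the helper name pushFreezeThroughUnaryOp is hypothetical and not part of the patch):

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Hypothetical helper illustrating the guard used by visitFREEZE: a freeze
  // is only pushed through an operation that canCreateUndefOrPoison reports
  // as safe (which now includes e.g. the saturating-arith and min/max opcodes
  // added below).
  static SDValue pushFreezeThroughUnaryOp(SelectionDAG &DAG, SDValue N0) {
    if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly=*/false,
                                   /*ConsiderFlags=*/true))
      return SDValue(); // op may create poison; keep the freeze in place
    SDValue FrozenOp = DAG.getFreeze(N0.getOperand(0));
    return DAG.getNode(N0.getOpcode(), SDLoc(N0), N0.getValueType(), FrozenOp);
  }

The CopyFromReg change builds on the same property: since the value read from a register is treated as already frozen, isGuaranteedNotToBeUndefOrPoison can return true for it and a redundant freeze can be dropped entirely.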
Added:
Modified:
llvm/include/llvm/CodeGen/ISDOpcodes.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/test/CodeGen/AArch64/combine-mul.ll
llvm/test/CodeGen/AMDGPU/div_i128.ll
llvm/test/CodeGen/AMDGPU/rem_i128.ll
llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
llvm/test/CodeGen/RISCV/alu64.ll
llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
llvm/test/CodeGen/RISCV/bfloat-convert.ll
llvm/test/CodeGen/RISCV/double-convert.ll
llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
llvm/test/CodeGen/RISCV/float-convert.ll
llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
llvm/test/CodeGen/RISCV/forced-atomics.ll
llvm/test/CodeGen/RISCV/fpclamptosat.ll
llvm/test/CodeGen/RISCV/half-convert.ll
llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
llvm/test/CodeGen/RISCV/iabs.ll
llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
llvm/test/CodeGen/X86/abdu-vector-128.ll
llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
llvm/test/CodeGen/X86/combine-mul.ll
llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
llvm/test/CodeGen/X86/fold-masked-merge.ll
llvm/test/CodeGen/X86/freeze-binary.ll
llvm/test/CodeGen/X86/freeze-combine.ll
llvm/test/CodeGen/X86/freeze-vector.ll
llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
llvm/test/CodeGen/X86/gfni-rotates.ll
llvm/test/CodeGen/X86/known-never-zero.ll
llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
llvm/test/CodeGen/X86/pr38539.ll
llvm/test/CodeGen/X86/pr62286.ll
llvm/test/CodeGen/X86/scheduler-backtracking.ll
llvm/test/CodeGen/X86/sdiv_fix_sat.ll
llvm/test/CodeGen/X86/setcc-non-simple-type.ll
llvm/test/CodeGen/X86/vec_saddo.ll
llvm/test/CodeGen/X86/vec_ssubo.ll
llvm/test/CodeGen/X86/vec_uaddo.ll
llvm/test/CodeGen/X86/vec_usubo.ll
llvm/test/CodeGen/X86/vector-bo-select.ll
llvm/test/CodeGen/X86/vector-fshr-128.ll
llvm/test/CodeGen/X86/vector-fshr-256.ll
llvm/test/CodeGen/X86/vector-fshr-sub128.ll
llvm/test/CodeGen/X86/vector-shift-shl-128.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 078a936b061a32..6429947958ee91 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -205,6 +205,7 @@ enum NodeType {
/// CopyFromReg - This node indicates that the input value is a virtual or
/// physical register that is defined outside of the scope of this
/// SelectionDAG. The register is available from the RegisterSDNode object.
+ /// Note that CopyFromReg is considered as also freezing the value.
CopyFromReg,
/// UNDEF - An undefined node.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f8949b926922e4..4b81185c6e311f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15459,6 +15459,12 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
return N0;
+ // We currently avoid folding freeze over SRA/SRL, due to the problems seen
+ // with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for
+ // example https://reviews.llvm.org/D136529#4120959.
+ if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
+ return SDValue();
+
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
// Try to push freeze through instructions that propagate but don't produce
// poison as far as possible. If an operand of freeze follows three
@@ -15475,6 +15481,26 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
N0.getOpcode() == ISD::BUILD_PAIR ||
N0.getOpcode() == ISD::CONCAT_VECTORS;
+ // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
+ // ones" or "constant" into something that depends on FrozenUndef. We can
+ // instead pick undef values to keep those properties, while at the same time
+ // folding away the freeze.
+ // If we implement a more general solution for folding away freeze(undef) in
+ // the future, then this special handling can be removed.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR) {
+ SDLoc DL(N0);
+ EVT VT = N0.getValueType();
+ if (llvm::ISD::isBuildVectorAllOnes(N0.getNode()))
+ return DAG.getAllOnesConstant(DL, VT);
+ if (llvm::ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
+ SmallVector<SDValue, 8> NewVecC;
+ for (const SDValue &Op : N0->op_values())
+ NewVecC.push_back(
+ Op.isUndef() ? DAG.getConstant(0, DL, Op.getValueType()) : Op);
+ return DAG.getBuildVector(VT, DL, NewVecC);
+ }
+ }
+
SmallSetVector<SDValue, 8> MaybePoisonOperands;
for (SDValue Op : N0->ops()) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 224c0c5ee9706c..dfbfaa8c894f55 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5063,6 +5063,7 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
case ISD::VALUETYPE:
case ISD::FrameIndex:
case ISD::TargetFrameIndex:
+ case ISD::CopyFromReg:
return true;
case ISD::UNDEF:
@@ -5136,6 +5137,16 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::FREEZE:
case ISD::CONCAT_VECTORS:
case ISD::INSERT_SUBVECTOR:
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::MULHU:
+ case ISD::MULHS:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
case ISD::AND:
case ISD::XOR:
case ISD::ROTL:
@@ -5156,6 +5167,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::BUILD_PAIR:
return false;
+ case ISD::SELECT_CC:
case ISD::SETCC: {
// Integer setcc cannot create undef or poison.
if (Op.getOperand(0).getValueType().isInteger())
@@ -5165,7 +5177,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
// based on options and flags. The options and flags also cause special
// nonan condition codes to be used. Those condition codes may be preserved
// even if the nonan flag is dropped somewhere.
- ISD::CondCode CCCode = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ unsigned CCOp = Opcode == ISD::SETCC ? 2 : 4;
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(Op.getOperand(CCOp))->get();
if (((unsigned)CCCode & 0x10U))
return true;
@@ -5182,6 +5195,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
return false;
case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
// If the max shift amount isn't in range, then the shift can create poison.
return !getValidMaximumShiftAmountConstant(Op, DemandedElts);
diff --git a/llvm/test/CodeGen/AArch64/combine-mul.ll b/llvm/test/CodeGen/AArch64/combine-mul.ll
index a2b0425308093d..c49e5ae6620a9e 100644
--- a/llvm/test/CodeGen/AArch64/combine-mul.ll
+++ b/llvm/test/CodeGen/AArch64/combine-mul.ll
@@ -44,8 +44,7 @@ define <4 x i1> @PR48683_vec_undef(<4 x i32> %x) {
define i64 @combine_mul_self_demandedbits(i64 %x) {
; CHECK-LABEL: combine_mul_self_demandedbits:
; CHECK: // %bb.0:
-; CHECK-NEXT: mul x8, x0, x0
-; CHECK-NEXT: and x0, x8, #0xfffffffffffffffd
+; CHECK-NEXT: mul x0, x0, x0
; CHECK-NEXT: ret
%1 = mul i64 %x, %x
%2 = and i64 %1, -3
@@ -77,7 +76,7 @@ define i8 @one_demanded_bit(i8 %x) {
define <2 x i64> @one_demanded_bit_splat(<2 x i64> %x) {
; CHECK-LABEL: one_demanded_bit_splat:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32
+; CHECK-NEXT: mov w8, #32 // =0x20
; CHECK-NEXT: shl v0.2d, v0.2d, #5
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -131,7 +130,7 @@ define i32 @squared_demanded_2_low_bits(i32 %x) {
define <2 x i64> @squared_demanded_2_low_bits_splat(<2 x i64> %x) {
; CHECK-LABEL: squared_demanded_2_low_bits_splat:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-2
+; CHECK-NEXT: mov x8, #-2 // =0xfffffffffffffffe
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index cf99b5d80e13a8..b2f9bf89d9ec60 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -282,21 +282,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14
; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v4
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1
@@ -312,21 +312,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12
; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v8, v3, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v5, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v3, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v2, v5, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, v6
; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v4
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
@@ -339,18 +339,26 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7
@@ -403,7 +411,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
@@ -439,7 +448,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
@@ -690,10 +700,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
@@ -903,14 +913,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
@@ -1028,10 +1038,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 6ba66ccf71868e..b068d87c4d6f48 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -242,130 +242,137 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0: ; %bb.0: ; %_udiv-special-cases
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: s_mov_b32 s4, 63
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
-; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[6:7]
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[10:11]
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-O0-NEXT: v_ashrrev_i64 v[6:7], s4, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
-; GFX9-O0-NEXT: v_xor_b32_e64 v13, v11, v12
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6
+; GFX9-O0-NEXT: v_ashrrev_i64 v[15:16], s4, v[13:14]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-O0-NEXT: v_xor_b32_e64 v13, v8, v12
; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v12
-; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v7, v5, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v12
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16
+; GFX9-O0-NEXT: v_xor_b32_e64 v9, v8, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15
+; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, v6
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v2, v2, v6
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT: v_xor_b32_e64 v9, v9, v3
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, v6
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14
; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v12
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v11, v12, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v10, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v3, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v5, v6, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v2, v3, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -438,7 +445,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
@@ -474,7 +482,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
@@ -589,27 +598,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: ; %bb.2: ; %Flow
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(6)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_5
; GFX9-O0-NEXT: .LBB0_3: ; %Flow2
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -624,22 +633,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_9
; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 1
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1]
@@ -679,27 +688,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6
; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_4
; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while
; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -709,30 +718,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8
; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 63
; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3]
@@ -872,24 +881,24 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4
; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5
@@ -899,42 +908,42 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6
; GFX9-O0-NEXT: s_branch .LBB0_1
; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
@@ -1018,12 +1027,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8
@@ -1034,30 +1043,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_6
; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -1099,14 +1108,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4
; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12]
@@ -1152,12 +1161,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10
; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4
@@ -1172,18 +1181,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
@@ -1203,18 +1212,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 32
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[5:6]
@@ -1486,11 +1495,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
; GFX9-O0-NEXT: ; kill: killed $vgpr4
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
index 6629d34405492c..25106b456d2f7a 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
@@ -123,10 +123,9 @@ define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 4, 0
-; CHECK-NEXT: st.b $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 4, 0
+; CHECK-NEXT: st.b $a2, $a0, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -150,10 +149,9 @@ define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 4, 1
-; CHECK-NEXT: st.h $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 4, 1
+; CHECK-NEXT: st.h $a2, $a0, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -177,10 +175,9 @@ define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 4, 2
-; CHECK-NEXT: st.w $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 4, 2
+; CHECK-NEXT: st.w $a2, $a0, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -204,10 +201,9 @@ define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 4, 3
-; CHECK-NEXT: st.d $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 4, 3
+; CHECK-NEXT: st.d $a2, $a0, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -231,10 +227,9 @@ define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwin
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr1, $a0, 0
; CHECK-NEXT: xvst $xr1, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
-; CHECK-NEXT: addi.d $a2, $sp, 0
-; CHECK-NEXT: bstrins.d $a2, $a0, 4, 2
-; CHECK-NEXT: fst.s $fa0, $a2, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2
+; CHECK-NEXT: fst.s $fa0, $a0, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -258,10 +253,9 @@ define void @insert_4xdouble_idx(ptr %src, ptr %dst, double %in, i32 %idx) nounw
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr1, $a0, 0
; CHECK-NEXT: xvst $xr1, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
-; CHECK-NEXT: addi.d $a2, $sp, 0
-; CHECK-NEXT: bstrins.d $a2, $a0, 4, 3
-; CHECK-NEXT: fst.d $fa0, $a2, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3
+; CHECK-NEXT: fst.d $fa0, $a0, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
index 19171b7d8ed784..7f232073ae129c 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
@@ -87,10 +87,9 @@ define void @insert_16xi8_idx(ptr %src, ptr %dst, i8 %ins, i32 %idx) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vst $vr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 3, 0
-; CHECK-NEXT: st.b $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 3, 0
+; CHECK-NEXT: st.b $a2, $a0, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -107,10 +106,9 @@ define void @insert_8xi16_idx(ptr %src, ptr %dst, i16 %ins, i32 %idx) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vst $vr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 3, 1
-; CHECK-NEXT: st.h $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 3, 1
+; CHECK-NEXT: st.h $a2, $a0, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -127,10 +125,9 @@ define void @insert_4xi32_idx(ptr %src, ptr %dst, i32 %ins, i32 %idx) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vst $vr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 3, 2
-; CHECK-NEXT: st.w $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 3, 2
+; CHECK-NEXT: st.w $a2, $a0, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -147,10 +144,9 @@ define void @insert_2xi64_idx(ptr %src, ptr %dst, i64 %ins, i32 %idx) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vst $vr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 3, 3
-; CHECK-NEXT: st.d $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 3, 3
+; CHECK-NEXT: st.d $a2, $a0, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -167,10 +163,9 @@ define void @insert_4xfloat_idx(ptr %src, ptr %dst, float %ins, i32 %idx) nounwi
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr1, $a0, 0
; CHECK-NEXT: vst $vr1, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
-; CHECK-NEXT: addi.d $a2, $sp, 0
-; CHECK-NEXT: bstrins.d $a2, $a0, 3, 2
-; CHECK-NEXT: fst.s $fa0, $a2, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 3, 2
+; CHECK-NEXT: fst.s $fa0, $a0, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -187,10 +182,9 @@ define void @insert_2xdouble_idx(ptr %src, ptr %dst, double %ins, i32 %idx) noun
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr1, $a0, 0
; CHECK-NEXT: vst $vr1, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
-; CHECK-NEXT: addi.d $a2, $sp, 0
-; CHECK-NEXT: bstrins.d $a2, $a0, 3, 3
-; CHECK-NEXT: fst.d $fa0, $a2, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 3, 3
+; CHECK-NEXT: fst.d $fa0, $a0, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll
index d2ee80e6aa9513..f032756e007b68 100644
--- a/llvm/test/CodeGen/RISCV/alu64.ll
+++ b/llvm/test/CodeGen/RISCV/alu64.ll
@@ -57,8 +57,8 @@ define i64 @sltiu(i64 %a) nounwind {
;
; RV32I-LABEL: sltiu:
; RV32I: # %bb.0:
-; RV32I-NEXT: seqz a1, a1
; RV32I-NEXT: sltiu a0, a0, 3
+; RV32I-NEXT: seqz a1, a1
; RV32I-NEXT: and a0, a1, a0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index f96e1bad2e3895..a5a2ae79966c3f 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -372,10 +372,10 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
; RV32IA-NEXT: # =>This Loop Header: Depth=1
; RV32IA-NEXT: # Child Loop BB2_3 Depth 2
; RV32IA-NEXT: mv a3, a2
-; RV32IA-NEXT: addi a4, a2, 1
-; RV32IA-NEXT: sltu a2, a2, a1
-; RV32IA-NEXT: neg a2, a2
-; RV32IA-NEXT: and a4, a2, a4
+; RV32IA-NEXT: addi a2, a2, 1
+; RV32IA-NEXT: sltu a4, a3, a1
+; RV32IA-NEXT: neg a4, a4
+; RV32IA-NEXT: and a4, a4, a2
; RV32IA-NEXT: .LBB2_3: # %atomicrmw.start
; RV32IA-NEXT: # Parent Loop BB2_1 Depth=1
; RV32IA-NEXT: # => This Inner Loop Header: Depth=2
@@ -607,10 +607,10 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
; RV64IA-NEXT: # =>This Loop Header: Depth=1
; RV64IA-NEXT: # Child Loop BB3_3 Depth 2
; RV64IA-NEXT: mv a3, a2
-; RV64IA-NEXT: addi a4, a2, 1
-; RV64IA-NEXT: sltu a2, a2, a1
-; RV64IA-NEXT: neg a2, a2
-; RV64IA-NEXT: and a4, a2, a4
+; RV64IA-NEXT: addi a2, a2, 1
+; RV64IA-NEXT: sltu a4, a3, a1
+; RV64IA-NEXT: neg a4, a4
+; RV64IA-NEXT: and a4, a4, a2
; RV64IA-NEXT: .LBB3_3: # %atomicrmw.start
; RV64IA-NEXT: # Parent Loop BB3_1 Depth=1
; RV64IA-NEXT: # => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
index 9e2b0b5c3cbb41..770dcccee882be 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
@@ -456,92 +456,80 @@ define i64 @fcvt_l_bf16(bfloat %a) nounwind {
define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
; RV32IZFBFMIN-LABEL: fcvt_l_bf16_sat:
; RV32IZFBFMIN: # %bb.0: # %start
-; RV32IZFBFMIN-NEXT: addi sp, sp, -32
-; RV32IZFBFMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: fsw fs0, 8(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32IZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
+; RV32IZFBFMIN-NEXT: addi sp, sp, -16
+; RV32IZFBFMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fs0, fa0
-; RV32IZFBFMIN-NEXT: flt.s s0, fa5, fs0
-; RV32IZFBFMIN-NEXT: neg s1, s0
; RV32IZFBFMIN-NEXT: lui a0, 913408
; RV32IZFBFMIN-NEXT: fmv.w.x fa5, a0
-; RV32IZFBFMIN-NEXT: fle.s s2, fa5, fs0
-; RV32IZFBFMIN-NEXT: neg s3, s2
+; RV32IZFBFMIN-NEXT: fle.s s0, fa5, fs0
; RV32IZFBFMIN-NEXT: fmv.s fa0, fs0
; RV32IZFBFMIN-NEXT: call __fixsfdi
-; RV32IZFBFMIN-NEXT: and a0, s3, a0
-; RV32IZFBFMIN-NEXT: or a0, s1, a0
-; RV32IZFBFMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFBFMIN-NEXT: neg a2, a2
; RV32IZFBFMIN-NEXT: lui a4, 524288
-; RV32IZFBFMIN-NEXT: lui a3, 524288
-; RV32IZFBFMIN-NEXT: beqz s2, .LBB10_2
+; RV32IZFBFMIN-NEXT: lui a2, 524288
+; RV32IZFBFMIN-NEXT: beqz s0, .LBB10_2
; RV32IZFBFMIN-NEXT: # %bb.1: # %start
-; RV32IZFBFMIN-NEXT: mv a3, a1
+; RV32IZFBFMIN-NEXT: mv a2, a1
; RV32IZFBFMIN-NEXT: .LBB10_2: # %start
-; RV32IZFBFMIN-NEXT: and a0, a2, a0
-; RV32IZFBFMIN-NEXT: beqz s0, .LBB10_4
+; RV32IZFBFMIN-NEXT: lui a1, %hi(.LCPI10_0)
+; RV32IZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IZFBFMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFBFMIN-NEXT: beqz a3, .LBB10_4
; RV32IZFBFMIN-NEXT: # %bb.3:
-; RV32IZFBFMIN-NEXT: addi a3, a4, -1
+; RV32IZFBFMIN-NEXT: addi a2, a4, -1
; RV32IZFBFMIN-NEXT: .LBB10_4: # %start
-; RV32IZFBFMIN-NEXT: and a1, a2, a3
-; RV32IZFBFMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: flw fs0, 8(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: addi sp, sp, 32
+; RV32IZFBFMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFBFMIN-NEXT: neg a4, a1
+; RV32IZFBFMIN-NEXT: and a1, a4, a2
+; RV32IZFBFMIN-NEXT: neg a2, a3
+; RV32IZFBFMIN-NEXT: neg a3, s0
+; RV32IZFBFMIN-NEXT: and a0, a3, a0
+; RV32IZFBFMIN-NEXT: or a0, a2, a0
+; RV32IZFBFMIN-NEXT: and a0, a4, a0
+; RV32IZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT: addi sp, sp, 16
; RV32IZFBFMIN-NEXT: ret
;
; R32IDZFBFMIN-LABEL: fcvt_l_bf16_sat:
; R32IDZFBFMIN: # %bb.0: # %start
-; R32IDZFBFMIN-NEXT: addi sp, sp, -32
-; R32IDZFBFMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; R32IDZFBFMIN-NEXT: addi sp, sp, -16
+; R32IDZFBFMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; R32IDZFBFMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; R32IDZFBFMIN-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
-; R32IDZFBFMIN-NEXT: lui a0, %hi(.LCPI10_0)
-; R32IDZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; R32IDZFBFMIN-NEXT: fcvt.s.bf16 fs0, fa0
-; R32IDZFBFMIN-NEXT: flt.s s0, fa5, fs0
-; R32IDZFBFMIN-NEXT: neg s1, s0
; R32IDZFBFMIN-NEXT: lui a0, 913408
; R32IDZFBFMIN-NEXT: fmv.w.x fa5, a0
-; R32IDZFBFMIN-NEXT: fle.s s2, fa5, fs0
-; R32IDZFBFMIN-NEXT: neg s3, s2
+; R32IDZFBFMIN-NEXT: fle.s s0, fa5, fs0
; R32IDZFBFMIN-NEXT: fmv.s fa0, fs0
; R32IDZFBFMIN-NEXT: call __fixsfdi
-; R32IDZFBFMIN-NEXT: and a0, s3, a0
-; R32IDZFBFMIN-NEXT: or a0, s1, a0
-; R32IDZFBFMIN-NEXT: feq.s a2, fs0, fs0
-; R32IDZFBFMIN-NEXT: neg a2, a2
; R32IDZFBFMIN-NEXT: lui a4, 524288
-; R32IDZFBFMIN-NEXT: lui a3, 524288
-; R32IDZFBFMIN-NEXT: beqz s2, .LBB10_2
+; R32IDZFBFMIN-NEXT: lui a2, 524288
+; R32IDZFBFMIN-NEXT: beqz s0, .LBB10_2
; R32IDZFBFMIN-NEXT: # %bb.1: # %start
-; R32IDZFBFMIN-NEXT: mv a3, a1
+; R32IDZFBFMIN-NEXT: mv a2, a1
; R32IDZFBFMIN-NEXT: .LBB10_2: # %start
-; R32IDZFBFMIN-NEXT: and a0, a2, a0
-; R32IDZFBFMIN-NEXT: beqz s0, .LBB10_4
+; R32IDZFBFMIN-NEXT: lui a1, %hi(.LCPI10_0)
+; R32IDZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; R32IDZFBFMIN-NEXT: flt.s a3, fa5, fs0
+; R32IDZFBFMIN-NEXT: beqz a3, .LBB10_4
; R32IDZFBFMIN-NEXT: # %bb.3:
-; R32IDZFBFMIN-NEXT: addi a3, a4, -1
+; R32IDZFBFMIN-NEXT: addi a2, a4, -1
; R32IDZFBFMIN-NEXT: .LBB10_4: # %start
-; R32IDZFBFMIN-NEXT: and a1, a2, a3
-; R32IDZFBFMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; R32IDZFBFMIN-NEXT: feq.s a1, fs0, fs0
+; R32IDZFBFMIN-NEXT: neg a4, a1
+; R32IDZFBFMIN-NEXT: and a1, a4, a2
+; R32IDZFBFMIN-NEXT: neg a2, a3
+; R32IDZFBFMIN-NEXT: neg a3, s0
+; R32IDZFBFMIN-NEXT: and a0, a3, a0
+; R32IDZFBFMIN-NEXT: or a0, a2, a0
+; R32IDZFBFMIN-NEXT: and a0, a4, a0
+; R32IDZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; R32IDZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; R32IDZFBFMIN-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
-; R32IDZFBFMIN-NEXT: addi sp, sp, 32
+; R32IDZFBFMIN-NEXT: addi sp, sp, 16
; R32IDZFBFMIN-NEXT: ret
;
; RV32ID-LABEL: fcvt_l_bf16_sat:
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index c147d6ec6d9b15..6024a29da33d2e 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -692,28 +692,27 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB12_2
+; RV32IFD-NEXT: beqz s0, .LBB12_2
; RV32IFD-NEXT: # %bb.1: # %start
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB12_2: # %start
; RV32IFD-NEXT: lui a1, %hi(.LCPI12_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI12_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB12_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB12_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB12_4: # %start
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -790,33 +789,32 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a1
; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: lui a3, 278016
+; RV32I-NEXT: addi a3, a3, -1
+; RV32I-NEXT: li a2, -1
+; RV32I-NEXT: call __gtdf2
+; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: lui a3, 802304
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s0
; RV32I-NEXT: li a2, 0
; RV32I-NEXT: call __gedf2
-; RV32I-NEXT: mv s2, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s0
; RV32I-NEXT: call __fixdfdi
-; RV32I-NEXT: mv s3, a0
-; RV32I-NEXT: mv s4, a1
-; RV32I-NEXT: lui s6, 524288
-; RV32I-NEXT: bgez s2, .LBB12_2
+; RV32I-NEXT: mv s4, a0
+; RV32I-NEXT: mv s5, a1
+; RV32I-NEXT: lui a0, 524288
+; RV32I-NEXT: bgez s3, .LBB12_2
; RV32I-NEXT: # %bb.1: # %start
-; RV32I-NEXT: lui s4, 524288
+; RV32I-NEXT: lui s5, 524288
; RV32I-NEXT: .LBB12_2: # %start
-; RV32I-NEXT: lui a3, 278016
-; RV32I-NEXT: addi a3, a3, -1
-; RV32I-NEXT: li a2, -1
-; RV32I-NEXT: mv a0, s1
-; RV32I-NEXT: mv a1, s0
-; RV32I-NEXT: call __gtdf2
-; RV32I-NEXT: mv s5, a0
-; RV32I-NEXT: blez a0, .LBB12_4
+; RV32I-NEXT: blez s2, .LBB12_4
; RV32I-NEXT: # %bb.3: # %start
-; RV32I-NEXT: addi s4, s6, -1
+; RV32I-NEXT: addi s5, a0, -1
; RV32I-NEXT: .LBB12_4: # %start
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s0
@@ -825,11 +823,11 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32I-NEXT: call __unorddf2
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a1, a0, s4
-; RV32I-NEXT: slti a2, s2, 0
+; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: slti a2, s3, 0
; RV32I-NEXT: addi a2, a2, -1
-; RV32I-NEXT: and a2, a2, s3
-; RV32I-NEXT: sgtz a3, s5
+; RV32I-NEXT: and a2, a2, s4
+; RV32I-NEXT: sgtz a3, s2
; RV32I-NEXT: neg a3, a3
; RV32I-NEXT: or a2, a3, a2
; RV32I-NEXT: and a0, a0, a2
@@ -840,7 +838,6 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 0(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -949,22 +946,23 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
; RV32IFD-NEXT: addi sp, sp, -16
; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IFD-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: lui a0, %hi(.LCPI14_0)
-; RV32IFD-NEXT: fld fa5, %lo(.LCPI14_0)(a0)
-; RV32IFD-NEXT: flt.d a0, fa5, fa0
-; RV32IFD-NEXT: neg s0, a0
+; RV32IFD-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fcvt.d.w fa5, zero
; RV32IFD-NEXT: fle.d a0, fa5, fa0
-; RV32IFD-NEXT: neg s1, a0
+; RV32IFD-NEXT: neg s0, a0
; RV32IFD-NEXT: call __fixunsdfdi
-; RV32IFD-NEXT: and a0, s1, a0
-; RV32IFD-NEXT: or a0, s0, a0
-; RV32IFD-NEXT: and a1, s1, a1
-; RV32IFD-NEXT: or a1, s0, a1
+; RV32IFD-NEXT: lui a2, %hi(.LCPI14_0)
+; RV32IFD-NEXT: fld fa5, %lo(.LCPI14_0)(a2)
+; RV32IFD-NEXT: and a0, s0, a0
+; RV32IFD-NEXT: flt.d a2, fa5, fs0
+; RV32IFD-NEXT: neg a2, a2
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a1, s0, a1
+; RV32IFD-NEXT: or a1, a2, a1
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
; RV32IFD-NEXT: addi sp, sp, 16
; RV32IFD-NEXT: ret
;
@@ -983,27 +981,24 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFINXZDINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32IZFINXZDINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32IZFINXZDINX-NEXT: mv s1, a1
-; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero
; RV32IZFINXZDINX-NEXT: mv s0, a0
-; RV32IZFINXZDINX-NEXT: fle.d a0, a2, s0
-; RV32IZFINXZDINX-NEXT: neg s2, a0
-; RV32IZFINXZDINX-NEXT: mv a0, s0
; RV32IZFINXZDINX-NEXT: call __fixunsdfdi
-; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI14_0)
-; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI14_0+4)(a2)
-; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI14_0)(a2)
-; RV32IZFINXZDINX-NEXT: and a0, s2, a0
-; RV32IZFINXZDINX-NEXT: flt.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero
+; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI14_0)
+; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI14_0+4)(a4)
+; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI14_0)(a4)
+; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0
; RV32IZFINXZDINX-NEXT: neg a2, a2
-; RV32IZFINXZDINX-NEXT: or a0, a2, a0
-; RV32IZFINXZDINX-NEXT: and a1, s2, a1
-; RV32IZFINXZDINX-NEXT: or a1, a2, a1
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
+; RV32IZFINXZDINX-NEXT: flt.d a3, a4, s0
+; RV32IZFINXZDINX-NEXT: neg a3, a3
+; RV32IZFINXZDINX-NEXT: or a0, a3, a0
+; RV32IZFINXZDINX-NEXT: and a1, a2, a1
+; RV32IZFINXZDINX-NEXT: or a1, a3, a1
; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: addi sp, sp, 16
; RV32IZFINXZDINX-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
index f1c56b320b76c4..927eee2e9e5451 100644
--- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
@@ -53,28 +53,27 @@ define i64 @test_floor_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB1_2
+; RV32IFD-NEXT: beqz s0, .LBB1_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB1_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI1_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI1_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB1_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB1_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB1_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -299,28 +298,27 @@ define i64 @test_ceil_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB5_2
+; RV32IFD-NEXT: beqz s0, .LBB5_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB5_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI5_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI5_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB5_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB5_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB5_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -545,28 +543,27 @@ define i64 @test_trunc_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB9_2
+; RV32IFD-NEXT: beqz s0, .LBB9_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB9_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI9_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI9_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB9_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB9_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB9_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -791,28 +788,27 @@ define i64 @test_round_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB13_2
+; RV32IFD-NEXT: beqz s0, .LBB13_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB13_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI13_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI13_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB13_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB13_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB13_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -1037,28 +1033,27 @@ define i64 @test_roundeven_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB17_2
+; RV32IFD-NEXT: beqz s0, .LBB17_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB17_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI17_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI17_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB17_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB17_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB17_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -1283,28 +1278,27 @@ define i64 @test_rint_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB21_2
+; RV32IFD-NEXT: beqz s0, .LBB21_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB21_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI21_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI21_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB21_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB21_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB21_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll
index 653b64ec730496..7eabd3f5f2273a 100644
--- a/llvm/test/CodeGen/RISCV/float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert.ll
@@ -275,26 +275,24 @@ define i32 @fcvt_wu_s_sat(float %a) nounwind {
; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: lui a1, 325632
-; RV32I-NEXT: addi a1, a1, -1
-; RV32I-NEXT: call __gtsf2
-; RV32I-NEXT: sgtz a0, a0
-; RV32I-NEXT: neg s1, a0
-; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __gesf2
; RV32I-NEXT: slti a0, a0, 0
-; RV32I-NEXT: addi s2, a0, -1
+; RV32I-NEXT: addi s1, a0, -1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: call __fixunssfsi
-; RV32I-NEXT: and a0, s2, a0
-; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: and s1, s1, a0
+; RV32I-NEXT: lui a1, 325632
+; RV32I-NEXT: addi a1, a1, -1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: call __gtsf2
+; RV32I-NEXT: sgtz a0, a0
+; RV32I-NEXT: neg a0, a0
+; RV32I-NEXT: or a0, a0, s1
; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
@@ -618,38 +616,36 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fa0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI12_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI12_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB12_2
; RV32IF-NEXT: # %bb.1: # %start
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB12_2: # %start
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI12_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI12_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB12_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB12_4: # %start
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -867,22 +863,23 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: lui a0, %hi(.LCPI14_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI14_0)(a0)
-; RV32IF-NEXT: flt.s a0, fa5, fa0
-; RV32IF-NEXT: neg s0, a0
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: fmv.w.x fa5, zero
; RV32IF-NEXT: fle.s a0, fa5, fa0
-; RV32IF-NEXT: neg s1, a0
+; RV32IF-NEXT: neg s0, a0
; RV32IF-NEXT: call __fixunssfdi
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: or a0, s0, a0
-; RV32IF-NEXT: and a1, s1, a1
-; RV32IF-NEXT: or a1, s0, a1
+; RV32IF-NEXT: lui a2, %hi(.LCPI14_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI14_0)(a2)
+; RV32IF-NEXT: and a0, s0, a0
+; RV32IF-NEXT: flt.s a2, fa5, fs0
+; RV32IF-NEXT: neg a2, a2
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a1, s0, a1
+; RV32IF-NEXT: or a1, a2, a1
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -901,17 +898,19 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
; RV32IZFINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32IZFINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFINX-NEXT: lui a1, %hi(.LCPI14_0)
-; RV32IZFINX-NEXT: lw a1, %lo(.LCPI14_0)(a1)
-; RV32IZFINX-NEXT: flt.s a1, a1, a0
-; RV32IZFINX-NEXT: neg s0, a1
-; RV32IZFINX-NEXT: fle.s a1, zero, a0
-; RV32IZFINX-NEXT: neg s1, a1
+; RV32IZFINX-NEXT: mv s0, a0
+; RV32IZFINX-NEXT: fle.s a0, zero, a0
+; RV32IZFINX-NEXT: neg s1, a0
+; RV32IZFINX-NEXT: mv a0, s0
; RV32IZFINX-NEXT: call __fixunssfdi
+; RV32IZFINX-NEXT: lui a2, %hi(.LCPI14_0)
+; RV32IZFINX-NEXT: lw a2, %lo(.LCPI14_0)(a2)
; RV32IZFINX-NEXT: and a0, s1, a0
-; RV32IZFINX-NEXT: or a0, s0, a0
+; RV32IZFINX-NEXT: flt.s a2, a2, s0
+; RV32IZFINX-NEXT: neg a2, a2
+; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: and a1, s1, a1
-; RV32IZFINX-NEXT: or a1, s0, a1
+; RV32IZFINX-NEXT: or a1, a2, a1
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -929,33 +928,36 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
;
; RV32I-LABEL: fcvt_lu_s_sat:
; RV32I: # %bb.0: # %start
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: lui a1, 391168
-; RV32I-NEXT: addi a1, a1, -1
-; RV32I-NEXT: call __gtsf2
-; RV32I-NEXT: sgtz a0, a0
-; RV32I-NEXT: neg s1, a0
-; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __gesf2
; RV32I-NEXT: slti a0, a0, 0
; RV32I-NEXT: addi s2, a0, -1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: call __fixunssfdi
-; RV32I-NEXT: and a0, s2, a0
-; RV32I-NEXT: or a0, s1, a0
-; RV32I-NEXT: and a1, s2, a1
-; RV32I-NEXT: or a1, s1, a1
-; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: mv s1, a1
+; RV32I-NEXT: and s3, s2, a0
+; RV32I-NEXT: lui a1, 391168
+; RV32I-NEXT: addi a1, a1, -1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: call __gtsf2
+; RV32I-NEXT: sgtz a0, a0
+; RV32I-NEXT: neg a1, a0
+; RV32I-NEXT: or a0, a1, s3
+; RV32I-NEXT: and a2, s2, s1
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: fcvt_lu_s_sat:
@@ -2089,26 +2091,24 @@ define zeroext i32 @fcvt_wu_s_sat_zext(float %a) nounwind {
; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: lui a1, 325632
-; RV32I-NEXT: addi a1, a1, -1
-; RV32I-NEXT: call __gtsf2
-; RV32I-NEXT: sgtz a0, a0
-; RV32I-NEXT: neg s1, a0
-; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __gesf2
; RV32I-NEXT: slti a0, a0, 0
-; RV32I-NEXT: addi s2, a0, -1
+; RV32I-NEXT: addi s1, a0, -1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: call __fixunssfsi
-; RV32I-NEXT: and a0, s2, a0
-; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: and s1, s1, a0
+; RV32I-NEXT: lui a1, 325632
+; RV32I-NEXT: addi a1, a1, -1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: call __gtsf2
+; RV32I-NEXT: sgtz a0, a0
+; RV32I-NEXT: neg a0, a0
+; RV32I-NEXT: or a0, a0, s1
; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
index 4f747c278da03c..5e99c7eb905628 100644
--- a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
@@ -37,8 +37,7 @@ define i64 @test_floor_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -53,33 +52,32 @@ define i64 @test_floor_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI1_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI1_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB1_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB1_4:
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI1_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI1_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB1_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB1_6:
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -117,23 +115,23 @@ define i64 @test_floor_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI1_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI1_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a3, a2, s0
-; RV32IZFINX-NEXT: neg a2, a3
+; RV32IZFINX-NEXT: flt.s a4, a2, s0
+; RV32IZFINX-NEXT: neg a2, a4
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a4, 524288
+; RV32IZFINX-NEXT: lui a3, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB1_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a4, a1
+; RV32IZFINX-NEXT: mv a3, a1
; RV32IZFINX-NEXT: .LBB1_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a3, .LBB1_6
+; RV32IZFINX-NEXT: beqz a4, .LBB1_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a4, a5, -1
+; RV32IZFINX-NEXT: addi a3, a5, -1
; RV32IZFINX-NEXT: .LBB1_6:
-; RV32IZFINX-NEXT: and a1, a2, a4
+; RV32IZFINX-NEXT: and a1, a2, a3
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -299,8 +297,7 @@ define i64 @test_ceil_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -315,33 +312,32 @@ define i64 @test_ceil_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI5_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI5_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB5_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB5_4:
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI5_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI5_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB5_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB5_6:
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -379,23 +375,23 @@ define i64 @test_ceil_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI5_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI5_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a3, a2, s0
-; RV32IZFINX-NEXT: neg a2, a3
+; RV32IZFINX-NEXT: flt.s a4, a2, s0
+; RV32IZFINX-NEXT: neg a2, a4
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a4, 524288
+; RV32IZFINX-NEXT: lui a3, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB5_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a4, a1
+; RV32IZFINX-NEXT: mv a3, a1
; RV32IZFINX-NEXT: .LBB5_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a3, .LBB5_6
+; RV32IZFINX-NEXT: beqz a4, .LBB5_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a4, a5, -1
+; RV32IZFINX-NEXT: addi a3, a5, -1
; RV32IZFINX-NEXT: .LBB5_6:
-; RV32IZFINX-NEXT: and a1, a2, a4
+; RV32IZFINX-NEXT: and a1, a2, a3
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -561,8 +557,7 @@ define i64 @test_trunc_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -577,33 +572,32 @@ define i64 @test_trunc_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI9_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI9_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB9_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB9_4:
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI9_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI9_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB9_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB9_6:
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -641,23 +635,23 @@ define i64 @test_trunc_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI9_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI9_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a3, a2, s0
-; RV32IZFINX-NEXT: neg a2, a3
+; RV32IZFINX-NEXT: flt.s a4, a2, s0
+; RV32IZFINX-NEXT: neg a2, a4
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a4, 524288
+; RV32IZFINX-NEXT: lui a3, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB9_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a4, a1
+; RV32IZFINX-NEXT: mv a3, a1
; RV32IZFINX-NEXT: .LBB9_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a3, .LBB9_6
+; RV32IZFINX-NEXT: beqz a4, .LBB9_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a4, a5, -1
+; RV32IZFINX-NEXT: addi a3, a5, -1
; RV32IZFINX-NEXT: .LBB9_6:
-; RV32IZFINX-NEXT: and a1, a2, a4
+; RV32IZFINX-NEXT: and a1, a2, a3
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -823,8 +817,7 @@ define i64 @test_round_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -839,33 +832,32 @@ define i64 @test_round_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI13_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI13_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB13_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB13_4:
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI13_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI13_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB13_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB13_6:
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -903,23 +895,23 @@ define i64 @test_round_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI13_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI13_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a3, a2, s0
-; RV32IZFINX-NEXT: neg a2, a3
+; RV32IZFINX-NEXT: flt.s a4, a2, s0
+; RV32IZFINX-NEXT: neg a2, a4
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a4, 524288
+; RV32IZFINX-NEXT: lui a3, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB13_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a4, a1
+; RV32IZFINX-NEXT: mv a3, a1
; RV32IZFINX-NEXT: .LBB13_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a3, .LBB13_6
+; RV32IZFINX-NEXT: beqz a4, .LBB13_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a4, a5, -1
+; RV32IZFINX-NEXT: addi a3, a5, -1
; RV32IZFINX-NEXT: .LBB13_6:
-; RV32IZFINX-NEXT: and a1, a2, a4
+; RV32IZFINX-NEXT: and a1, a2, a3
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -1085,8 +1077,7 @@ define i64 @test_roundeven_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -1101,33 +1092,32 @@ define i64 @test_roundeven_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI17_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI17_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB17_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB17_4:
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI17_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI17_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB17_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB17_6:
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -1165,23 +1155,23 @@ define i64 @test_roundeven_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI17_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI17_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a3, a2, s0
-; RV32IZFINX-NEXT: neg a2, a3
+; RV32IZFINX-NEXT: flt.s a4, a2, s0
+; RV32IZFINX-NEXT: neg a2, a4
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a4, 524288
+; RV32IZFINX-NEXT: lui a3, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB17_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a4, a1
+; RV32IZFINX-NEXT: mv a3, a1
; RV32IZFINX-NEXT: .LBB17_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a3, .LBB17_6
+; RV32IZFINX-NEXT: beqz a4, .LBB17_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a4, a5, -1
+; RV32IZFINX-NEXT: addi a3, a5, -1
; RV32IZFINX-NEXT: .LBB17_6:
-; RV32IZFINX-NEXT: and a1, a2, a4
+; RV32IZFINX-NEXT: and a1, a2, a3
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -1347,8 +1337,7 @@ define i64 @test_rint_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -1363,33 +1352,32 @@ define i64 @test_rint_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI21_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI21_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB21_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB21_4:
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI21_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI21_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB21_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB21_6:
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -1427,23 +1415,23 @@ define i64 @test_rint_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI21_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI21_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a3, a2, s0
-; RV32IZFINX-NEXT: neg a2, a3
+; RV32IZFINX-NEXT: flt.s a4, a2, s0
+; RV32IZFINX-NEXT: neg a2, a4
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a4, 524288
+; RV32IZFINX-NEXT: lui a3, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB21_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a4, a1
+; RV32IZFINX-NEXT: mv a3, a1
; RV32IZFINX-NEXT: .LBB21_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a3, .LBB21_6
+; RV32IZFINX-NEXT: beqz a4, .LBB21_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a4, a5, -1
+; RV32IZFINX-NEXT: addi a3, a5, -1
; RV32IZFINX-NEXT: .LBB21_6:
-; RV32IZFINX-NEXT: and a1, a2, a4
+; RV32IZFINX-NEXT: and a1, a2, a3
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll
index c303690aadfff8..f6a53a9d76dd35 100644
--- a/llvm/test/CodeGen/RISCV/forced-atomics.ll
+++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll
@@ -3567,8 +3567,8 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind {
; RV32-NEXT: # in Loop: Header=BB51_2 Depth=1
; RV32-NEXT: neg a3, a0
; RV32-NEXT: and a3, a3, a1
-; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: sw a4, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: mv a1, sp
; RV32-NEXT: li a4, 5
; RV32-NEXT: li a5, 5
@@ -3659,8 +3659,8 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind {
; RV32-NEXT: # in Loop: Header=BB52_2 Depth=1
; RV32-NEXT: neg a3, a0
; RV32-NEXT: and a3, a3, a1
-; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: sw a4, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: mv a1, sp
; RV32-NEXT: li a4, 5
; RV32-NEXT: li a5, 5
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 06ab813faf0253..deb5a6d4013d49 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -114,8 +114,8 @@ define i32 @utest_f64i32(double %x) {
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: .cfi_offset ra, -4
; RV32IF-NEXT: call __fixunsdfdi
-; RV32IF-NEXT: seqz a1, a1
; RV32IF-NEXT: sltiu a2, a0, -1
+; RV32IF-NEXT: seqz a1, a1
; RV32IF-NEXT: and a1, a1, a2
; RV32IF-NEXT: addi a1, a1, -1
; RV32IF-NEXT: or a0, a1, a0
@@ -429,8 +429,8 @@ define i32 @utesth_f16i32(half %x) {
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: call __extendhfsf2
; RV32-NEXT: call __fixunssfdi
-; RV32-NEXT: seqz a1, a1
; RV32-NEXT: sltiu a2, a0, -1
+; RV32-NEXT: seqz a1, a1
; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll
index 277749c75bbbf1..31fb6e2ee9c840 100644
--- a/llvm/test/CodeGen/RISCV/half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert.ll
@@ -2145,47 +2145,41 @@ define i64 @fcvt_l_h(half %a) nounwind {
define i64 @fcvt_l_h_sat(half %a) nounwind {
; RV32IZFH-LABEL: fcvt_l_h_sat:
; RV32IZFH: # %bb.0: # %start
-; RV32IZFH-NEXT: addi sp, sp, -32
-; RV32IZFH-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
+; RV32IZFH-NEXT: addi sp, sp, -16
+; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
-; RV32IZFH-NEXT: flt.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
-; RV32IZFH-NEXT: fle.s s2, fa5, fs0
-; RV32IZFH-NEXT: neg s3, s2
+; RV32IZFH-NEXT: fle.s s0, fa5, fs0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: and a0, s3, a0
-; RV32IZFH-NEXT: or a0, s1, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
; RV32IZFH-NEXT: lui a4, 524288
-; RV32IZFH-NEXT: lui a3, 524288
-; RV32IZFH-NEXT: beqz s2, .LBB10_2
+; RV32IZFH-NEXT: lui a2, 524288
+; RV32IZFH-NEXT: beqz s0, .LBB10_2
; RV32IZFH-NEXT: # %bb.1: # %start
-; RV32IZFH-NEXT: mv a3, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB10_2: # %start
-; RV32IZFH-NEXT: and a0, a2, a0
-; RV32IZFH-NEXT: beqz s0, .LBB10_4
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI10_0)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB10_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: addi a3, a4, -1
+; RV32IZFH-NEXT: addi a2, a4, -1
; RV32IZFH-NEXT: .LBB10_4: # %start
-; RV32IZFH-NEXT: and a1, a2, a3
-; RV32IZFH-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: addi sp, sp, 32
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: neg a3, s0
+; RV32IZFH-NEXT: and a0, a3, a0
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
+; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: addi sp, sp, 16
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: fcvt_l_h_sat:
@@ -2199,47 +2193,41 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
;
; RV32IDZFH-LABEL: fcvt_l_h_sat:
; RV32IDZFH: # %bb.0: # %start
-; RV32IDZFH-NEXT: addi sp, sp, -32
-; RV32IDZFH-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IDZFH-NEXT: addi sp, sp, -16
+; RV32IDZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IDZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32IDZFH-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
-; RV32IDZFH-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; RV32IDZFH-NEXT: fcvt.s.h fs0, fa0
-; RV32IDZFH-NEXT: flt.s s0, fa5, fs0
-; RV32IDZFH-NEXT: neg s1, s0
; RV32IDZFH-NEXT: lui a0, 913408
; RV32IDZFH-NEXT: fmv.w.x fa5, a0
-; RV32IDZFH-NEXT: fle.s s2, fa5, fs0
-; RV32IDZFH-NEXT: neg s3, s2
+; RV32IDZFH-NEXT: fle.s s0, fa5, fs0
; RV32IDZFH-NEXT: fmv.s fa0, fs0
; RV32IDZFH-NEXT: call __fixsfdi
-; RV32IDZFH-NEXT: and a0, s3, a0
-; RV32IDZFH-NEXT: or a0, s1, a0
-; RV32IDZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IDZFH-NEXT: neg a2, a2
; RV32IDZFH-NEXT: lui a4, 524288
-; RV32IDZFH-NEXT: lui a3, 524288
-; RV32IDZFH-NEXT: beqz s2, .LBB10_2
+; RV32IDZFH-NEXT: lui a2, 524288
+; RV32IDZFH-NEXT: beqz s0, .LBB10_2
; RV32IDZFH-NEXT: # %bb.1: # %start
-; RV32IDZFH-NEXT: mv a3, a1
+; RV32IDZFH-NEXT: mv a2, a1
; RV32IDZFH-NEXT: .LBB10_2: # %start
-; RV32IDZFH-NEXT: and a0, a2, a0
-; RV32IDZFH-NEXT: beqz s0, .LBB10_4
+; RV32IDZFH-NEXT: lui a1, %hi(.LCPI10_0)
+; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IDZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IDZFH-NEXT: beqz a3, .LBB10_4
; RV32IDZFH-NEXT: # %bb.3:
-; RV32IDZFH-NEXT: addi a3, a4, -1
+; RV32IDZFH-NEXT: addi a2, a4, -1
; RV32IDZFH-NEXT: .LBB10_4: # %start
-; RV32IDZFH-NEXT: and a1, a2, a3
-; RV32IDZFH-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IDZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IDZFH-NEXT: neg a4, a1
+; RV32IDZFH-NEXT: and a1, a4, a2
+; RV32IDZFH-NEXT: neg a2, a3
+; RV32IDZFH-NEXT: neg a3, s0
+; RV32IDZFH-NEXT: and a0, a3, a0
+; RV32IDZFH-NEXT: or a0, a2, a0
+; RV32IDZFH-NEXT: and a0, a4, a0
+; RV32IDZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IDZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IDZFH-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
-; RV32IDZFH-NEXT: addi sp, sp, 32
+; RV32IDZFH-NEXT: addi sp, sp, 16
; RV32IDZFH-NEXT: ret
;
; RV64IDZFH-LABEL: fcvt_l_h_sat:
@@ -2515,47 +2503,41 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
;
; RV32ID-LABEL: fcvt_l_h_sat:
; RV32ID: # %bb.0: # %start
-; RV32ID-NEXT: addi sp, sp, -32
-; RV32ID-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32ID-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32ID-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32ID-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32ID-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32ID-NEXT: addi sp, sp, -16
+; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32ID-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32ID-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
; RV32ID-NEXT: call __extendhfsf2
-; RV32ID-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32ID-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; RV32ID-NEXT: fmv.s fs0, fa0
-; RV32ID-NEXT: flt.s s0, fa5, fa0
-; RV32ID-NEXT: neg s1, s0
; RV32ID-NEXT: lui a0, 913408
; RV32ID-NEXT: fmv.w.x fa5, a0
-; RV32ID-NEXT: fle.s s2, fa5, fa0
-; RV32ID-NEXT: neg s3, s2
+; RV32ID-NEXT: fle.s s0, fa5, fa0
; RV32ID-NEXT: call __fixsfdi
-; RV32ID-NEXT: and a0, s3, a0
-; RV32ID-NEXT: or a0, s1, a0
-; RV32ID-NEXT: feq.s a2, fs0, fs0
-; RV32ID-NEXT: neg a2, a2
; RV32ID-NEXT: lui a4, 524288
-; RV32ID-NEXT: lui a3, 524288
-; RV32ID-NEXT: beqz s2, .LBB10_2
+; RV32ID-NEXT: lui a2, 524288
+; RV32ID-NEXT: beqz s0, .LBB10_2
; RV32ID-NEXT: # %bb.1: # %start
-; RV32ID-NEXT: mv a3, a1
+; RV32ID-NEXT: mv a2, a1
; RV32ID-NEXT: .LBB10_2: # %start
-; RV32ID-NEXT: and a0, a2, a0
-; RV32ID-NEXT: beqz s0, .LBB10_4
+; RV32ID-NEXT: lui a1, %hi(.LCPI10_0)
+; RV32ID-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; RV32ID-NEXT: flt.s a3, fa5, fs0
+; RV32ID-NEXT: beqz a3, .LBB10_4
; RV32ID-NEXT: # %bb.3:
-; RV32ID-NEXT: addi a3, a4, -1
+; RV32ID-NEXT: addi a2, a4, -1
; RV32ID-NEXT: .LBB10_4: # %start
-; RV32ID-NEXT: and a1, a2, a3
-; RV32ID-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32ID-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32ID-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32ID-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32ID-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32ID-NEXT: feq.s a1, fs0, fs0
+; RV32ID-NEXT: neg a4, a1
+; RV32ID-NEXT: and a1, a4, a2
+; RV32ID-NEXT: neg a2, s0
+; RV32ID-NEXT: and a0, a2, a0
+; RV32ID-NEXT: neg a2, a3
+; RV32ID-NEXT: or a0, a2, a0
+; RV32ID-NEXT: and a0, a4, a0
+; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32ID-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32ID-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
-; RV32ID-NEXT: addi sp, sp, 32
+; RV32ID-NEXT: addi sp, sp, 16
; RV32ID-NEXT: ret
;
; RV64ID-LABEL: fcvt_l_h_sat:
@@ -2574,47 +2556,41 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
;
; RV32IFZFHMIN-LABEL: fcvt_l_h_sat:
; RV32IFZFHMIN: # %bb.0: # %start
-; RV32IFZFHMIN-NEXT: addi sp, sp, -32
-; RV32IFZFHMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: fsw fs0, 8(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32IFZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
+; RV32IFZFHMIN-NEXT: addi sp, sp, -16
+; RV32IFZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IFZFHMIN-NEXT: fcvt.s.h fs0, fa0
-; RV32IFZFHMIN-NEXT: flt.s s0, fa5, fs0
-; RV32IFZFHMIN-NEXT: neg s1, s0
; RV32IFZFHMIN-NEXT: lui a0, 913408
; RV32IFZFHMIN-NEXT: fmv.w.x fa5, a0
-; RV32IFZFHMIN-NEXT: fle.s s2, fa5, fs0
-; RV32IFZFHMIN-NEXT: neg s3, s2
+; RV32IFZFHMIN-NEXT: fle.s s0, fa5, fs0
; RV32IFZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IFZFHMIN-NEXT: call __fixsfdi
-; RV32IFZFHMIN-NEXT: and a0, s3, a0
-; RV32IFZFHMIN-NEXT: or a0, s1, a0
-; RV32IFZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IFZFHMIN-NEXT: neg a2, a2
; RV32IFZFHMIN-NEXT: lui a4, 524288
-; RV32IFZFHMIN-NEXT: lui a3, 524288
-; RV32IFZFHMIN-NEXT: beqz s2, .LBB10_2
+; RV32IFZFHMIN-NEXT: lui a2, 524288
+; RV32IFZFHMIN-NEXT: beqz s0, .LBB10_2
; RV32IFZFHMIN-NEXT: # %bb.1: # %start
-; RV32IFZFHMIN-NEXT: mv a3, a1
+; RV32IFZFHMIN-NEXT: mv a2, a1
; RV32IFZFHMIN-NEXT: .LBB10_2: # %start
-; RV32IFZFHMIN-NEXT: and a0, a2, a0
-; RV32IFZFHMIN-NEXT: beqz s0, .LBB10_4
+; RV32IFZFHMIN-NEXT: lui a1, %hi(.LCPI10_0)
+; RV32IFZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IFZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IFZFHMIN-NEXT: beqz a3, .LBB10_4
; RV32IFZFHMIN-NEXT: # %bb.3:
-; RV32IFZFHMIN-NEXT: addi a3, a4, -1
+; RV32IFZFHMIN-NEXT: addi a2, a4, -1
; RV32IFZFHMIN-NEXT: .LBB10_4: # %start
-; RV32IFZFHMIN-NEXT: and a1, a2, a3
-; RV32IFZFHMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: flw fs0, 8(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: addi sp, sp, 32
+; RV32IFZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IFZFHMIN-NEXT: neg a4, a1
+; RV32IFZFHMIN-NEXT: and a1, a4, a2
+; RV32IFZFHMIN-NEXT: neg a2, a3
+; RV32IFZFHMIN-NEXT: neg a3, s0
+; RV32IFZFHMIN-NEXT: and a0, a3, a0
+; RV32IFZFHMIN-NEXT: or a0, a2, a0
+; RV32IFZFHMIN-NEXT: and a0, a4, a0
+; RV32IFZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT: addi sp, sp, 16
; RV32IFZFHMIN-NEXT: ret
;
; CHECK64-IZFHMIN-LABEL: fcvt_l_h_sat:
@@ -2629,47 +2605,41 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
;
; RV32IDZFHMIN-LABEL: fcvt_l_h_sat:
; RV32IDZFHMIN: # %bb.0: # %start
-; RV32IDZFHMIN-NEXT: addi sp, sp, -32
-; RV32IDZFHMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IDZFHMIN-NEXT: addi sp, sp, -16
+; RV32IDZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IDZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32IDZFHMIN-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
-; RV32IDZFHMIN-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32IDZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; RV32IDZFHMIN-NEXT: fcvt.s.h fs0, fa0
-; RV32IDZFHMIN-NEXT: flt.s s0, fa5, fs0
-; RV32IDZFHMIN-NEXT: neg s1, s0
; RV32IDZFHMIN-NEXT: lui a0, 913408
; RV32IDZFHMIN-NEXT: fmv.w.x fa5, a0
-; RV32IDZFHMIN-NEXT: fle.s s2, fa5, fs0
-; RV32IDZFHMIN-NEXT: neg s3, s2
+; RV32IDZFHMIN-NEXT: fle.s s0, fa5, fs0
; RV32IDZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IDZFHMIN-NEXT: call __fixsfdi
-; RV32IDZFHMIN-NEXT: and a0, s3, a0
-; RV32IDZFHMIN-NEXT: or a0, s1, a0
-; RV32IDZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IDZFHMIN-NEXT: neg a2, a2
; RV32IDZFHMIN-NEXT: lui a4, 524288
-; RV32IDZFHMIN-NEXT: lui a3, 524288
-; RV32IDZFHMIN-NEXT: beqz s2, .LBB10_2
+; RV32IDZFHMIN-NEXT: lui a2, 524288
+; RV32IDZFHMIN-NEXT: beqz s0, .LBB10_2
; RV32IDZFHMIN-NEXT: # %bb.1: # %start
-; RV32IDZFHMIN-NEXT: mv a3, a1
+; RV32IDZFHMIN-NEXT: mv a2, a1
; RV32IDZFHMIN-NEXT: .LBB10_2: # %start
-; RV32IDZFHMIN-NEXT: and a0, a2, a0
-; RV32IDZFHMIN-NEXT: beqz s0, .LBB10_4
+; RV32IDZFHMIN-NEXT: lui a1, %hi(.LCPI10_0)
+; RV32IDZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IDZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IDZFHMIN-NEXT: beqz a3, .LBB10_4
; RV32IDZFHMIN-NEXT: # %bb.3:
-; RV32IDZFHMIN-NEXT: addi a3, a4, -1
+; RV32IDZFHMIN-NEXT: addi a2, a4, -1
; RV32IDZFHMIN-NEXT: .LBB10_4: # %start
-; RV32IDZFHMIN-NEXT: and a1, a2, a3
-; RV32IDZFHMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IDZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IDZFHMIN-NEXT: neg a4, a1
+; RV32IDZFHMIN-NEXT: and a1, a4, a2
+; RV32IDZFHMIN-NEXT: neg a2, a3
+; RV32IDZFHMIN-NEXT: neg a3, s0
+; RV32IDZFHMIN-NEXT: and a0, a3, a0
+; RV32IDZFHMIN-NEXT: or a0, a2, a0
+; RV32IDZFHMIN-NEXT: and a0, a4, a0
+; RV32IDZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IDZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IDZFHMIN-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
-; RV32IDZFHMIN-NEXT: addi sp, sp, 32
+; RV32IDZFHMIN-NEXT: addi sp, sp, 16
; RV32IDZFHMIN-NEXT: ret
;
; CHECK32-IZHINXMIN-LABEL: fcvt_l_h_sat:
diff --git a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
index 9c95210bfa7c01..04a8a66f44598f 100644
--- a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
@@ -108,40 +108,38 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: lui a2, %hi(.LCPI1_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI1_1)(a2)
-; RV32IZFH-NEXT: and a0, s1, a0
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
-; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
+; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB1_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a4, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB1_4:
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI1_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI1_1)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB1_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: .LBB1_6:
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
-; RV32IZFH-NEXT: beqz a3, .LBB1_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a4, a5, -1
-; RV32IZFH-NEXT: .LBB1_6:
-; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_floor_si64:
@@ -179,16 +177,16 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI1_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI1_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a3, a2, s0
-; RV32IZHINX-NEXT: neg a2, a3
+; RV32IZHINX-NEXT: flt.s a4, a2, s0
+; RV32IZHINX-NEXT: neg a2, a4
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a4, 524288
+; RV32IZHINX-NEXT: lui a3, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB1_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a4, a1
+; RV32IZHINX-NEXT: mv a3, a1
; RV32IZHINX-NEXT: .LBB1_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -196,11 +194,11 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a3, .LBB1_6
+; RV32IZHINX-NEXT: beqz a4, .LBB1_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a4, a5, -1
+; RV32IZHINX-NEXT: addi a3, a5, -1
; RV32IZHINX-NEXT: .LBB1_6:
-; RV32IZHINX-NEXT: and a1, a2, a4
+; RV32IZHINX-NEXT: and a1, a2, a3
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_floor_si64:
@@ -238,41 +236,39 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
-; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI1_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI1_0)(a2)
-; RV32IZFHMIN-NEXT: and a0, s1, a0
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a2, a2
-; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
+; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB1_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a4, a1
+; RV32IZFHMIN-NEXT: mv a2, a1
; RV32IZFHMIN-NEXT: .LBB1_4:
+; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI1_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI1_0)(a1)
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: beqz a3, .LBB1_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a2, a4, -1
+; RV32IZFHMIN-NEXT: .LBB1_6:
+; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a4, a1
+; RV32IZFHMIN-NEXT: and a1, a4, a2
+; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
-; RV32IZFHMIN-NEXT: beqz a3, .LBB1_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a4, a5, -1
-; RV32IZFHMIN-NEXT: .LBB1_6:
-; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_floor_si64:
@@ -324,16 +320,16 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a3
+; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a4
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a4, 524288
+; RV32IZHINXMIN-NEXT: lui a3, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB1_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a4, a1
+; RV32IZHINXMIN-NEXT: mv a3, a1
; RV32IZHINXMIN-NEXT: .LBB1_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -341,11 +337,11 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a3, .LBB1_6
+; RV32IZHINXMIN-NEXT: beqz a4, .LBB1_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a4, a5, -1
+; RV32IZHINXMIN-NEXT: addi a3, a5, -1
; RV32IZHINXMIN-NEXT: .LBB1_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a4
+; RV32IZHINXMIN-NEXT: and a1, a2, a3
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_floor_si64:
@@ -824,40 +820,38 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: lui a2, %hi(.LCPI5_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI5_1)(a2)
-; RV32IZFH-NEXT: and a0, s1, a0
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
-; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
+; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB5_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a4, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB5_4:
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI5_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI5_1)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB5_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: .LBB5_6:
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
-; RV32IZFH-NEXT: beqz a3, .LBB5_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a4, a5, -1
-; RV32IZFH-NEXT: .LBB5_6:
-; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_ceil_si64:
@@ -895,16 +889,16 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI5_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI5_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a3, a2, s0
-; RV32IZHINX-NEXT: neg a2, a3
+; RV32IZHINX-NEXT: flt.s a4, a2, s0
+; RV32IZHINX-NEXT: neg a2, a4
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a4, 524288
+; RV32IZHINX-NEXT: lui a3, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB5_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a4, a1
+; RV32IZHINX-NEXT: mv a3, a1
; RV32IZHINX-NEXT: .LBB5_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -912,11 +906,11 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a3, .LBB5_6
+; RV32IZHINX-NEXT: beqz a4, .LBB5_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a4, a5, -1
+; RV32IZHINX-NEXT: addi a3, a5, -1
; RV32IZHINX-NEXT: .LBB5_6:
-; RV32IZHINX-NEXT: and a1, a2, a4
+; RV32IZHINX-NEXT: and a1, a2, a3
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_ceil_si64:
@@ -954,41 +948,39 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
-; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI5_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI5_0)(a2)
-; RV32IZFHMIN-NEXT: and a0, s1, a0
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a2, a2
-; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
+; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB5_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a4, a1
+; RV32IZFHMIN-NEXT: mv a2, a1
; RV32IZFHMIN-NEXT: .LBB5_4:
+; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI5_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI5_0)(a1)
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: beqz a3, .LBB5_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a2, a4, -1
+; RV32IZFHMIN-NEXT: .LBB5_6:
+; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a4, a1
+; RV32IZFHMIN-NEXT: and a1, a4, a2
+; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
-; RV32IZFHMIN-NEXT: beqz a3, .LBB5_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a4, a5, -1
-; RV32IZFHMIN-NEXT: .LBB5_6:
-; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_ceil_si64:
@@ -1040,16 +1032,16 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI5_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI5_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a3
+; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a4
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a4, 524288
+; RV32IZHINXMIN-NEXT: lui a3, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB5_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a4, a1
+; RV32IZHINXMIN-NEXT: mv a3, a1
; RV32IZHINXMIN-NEXT: .LBB5_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -1057,11 +1049,11 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a3, .LBB5_6
+; RV32IZHINXMIN-NEXT: beqz a4, .LBB5_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a4, a5, -1
+; RV32IZHINXMIN-NEXT: addi a3, a5, -1
; RV32IZHINXMIN-NEXT: .LBB5_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a4
+; RV32IZHINXMIN-NEXT: and a1, a2, a3
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_ceil_si64:
@@ -1540,40 +1532,38 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: lui a2, %hi(.LCPI9_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI9_1)(a2)
-; RV32IZFH-NEXT: and a0, s1, a0
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
-; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
+; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB9_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a4, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB9_4:
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI9_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI9_1)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB9_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: .LBB9_6:
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
-; RV32IZFH-NEXT: beqz a3, .LBB9_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a4, a5, -1
-; RV32IZFH-NEXT: .LBB9_6:
-; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_trunc_si64:
@@ -1611,16 +1601,16 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI9_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI9_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a3, a2, s0
-; RV32IZHINX-NEXT: neg a2, a3
+; RV32IZHINX-NEXT: flt.s a4, a2, s0
+; RV32IZHINX-NEXT: neg a2, a4
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a4, 524288
+; RV32IZHINX-NEXT: lui a3, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB9_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a4, a1
+; RV32IZHINX-NEXT: mv a3, a1
; RV32IZHINX-NEXT: .LBB9_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -1628,11 +1618,11 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a3, .LBB9_6
+; RV32IZHINX-NEXT: beqz a4, .LBB9_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a4, a5, -1
+; RV32IZHINX-NEXT: addi a3, a5, -1
; RV32IZHINX-NEXT: .LBB9_6:
-; RV32IZHINX-NEXT: and a1, a2, a4
+; RV32IZHINX-NEXT: and a1, a2, a3
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_trunc_si64:
@@ -1670,41 +1660,39 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
-; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI9_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI9_0)(a2)
-; RV32IZFHMIN-NEXT: and a0, s1, a0
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a2, a2
-; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
+; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB9_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a4, a1
+; RV32IZFHMIN-NEXT: mv a2, a1
; RV32IZFHMIN-NEXT: .LBB9_4:
+; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI9_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI9_0)(a1)
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: beqz a3, .LBB9_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a2, a4, -1
+; RV32IZFHMIN-NEXT: .LBB9_6:
+; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a4, a1
+; RV32IZFHMIN-NEXT: and a1, a4, a2
+; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
-; RV32IZFHMIN-NEXT: beqz a3, .LBB9_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a4, a5, -1
-; RV32IZFHMIN-NEXT: .LBB9_6:
-; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_trunc_si64:
@@ -1756,16 +1744,16 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI9_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI9_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a3
+; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a4
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a4, 524288
+; RV32IZHINXMIN-NEXT: lui a3, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB9_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a4, a1
+; RV32IZHINXMIN-NEXT: mv a3, a1
; RV32IZHINXMIN-NEXT: .LBB9_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -1773,11 +1761,11 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a3, .LBB9_6
+; RV32IZHINXMIN-NEXT: beqz a4, .LBB9_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a4, a5, -1
+; RV32IZHINXMIN-NEXT: addi a3, a5, -1
; RV32IZHINXMIN-NEXT: .LBB9_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a4
+; RV32IZHINXMIN-NEXT: and a1, a2, a3
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_trunc_si64:
@@ -2256,40 +2244,38 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: lui a2, %hi(.LCPI13_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI13_1)(a2)
-; RV32IZFH-NEXT: and a0, s1, a0
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
-; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
+; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB13_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a4, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB13_4:
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI13_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI13_1)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB13_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: .LBB13_6:
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
-; RV32IZFH-NEXT: beqz a3, .LBB13_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a4, a5, -1
-; RV32IZFH-NEXT: .LBB13_6:
-; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_round_si64:
@@ -2327,16 +2313,16 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI13_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI13_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a3, a2, s0
-; RV32IZHINX-NEXT: neg a2, a3
+; RV32IZHINX-NEXT: flt.s a4, a2, s0
+; RV32IZHINX-NEXT: neg a2, a4
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a4, 524288
+; RV32IZHINX-NEXT: lui a3, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB13_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a4, a1
+; RV32IZHINX-NEXT: mv a3, a1
; RV32IZHINX-NEXT: .LBB13_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -2344,11 +2330,11 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a3, .LBB13_6
+; RV32IZHINX-NEXT: beqz a4, .LBB13_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a4, a5, -1
+; RV32IZHINX-NEXT: addi a3, a5, -1
; RV32IZHINX-NEXT: .LBB13_6:
-; RV32IZHINX-NEXT: and a1, a2, a4
+; RV32IZHINX-NEXT: and a1, a2, a3
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_round_si64:
@@ -2386,41 +2372,39 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
-; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI13_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI13_0)(a2)
-; RV32IZFHMIN-NEXT: and a0, s1, a0
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a2, a2
-; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
+; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB13_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a4, a1
+; RV32IZFHMIN-NEXT: mv a2, a1
; RV32IZFHMIN-NEXT: .LBB13_4:
+; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI13_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI13_0)(a1)
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: beqz a3, .LBB13_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a2, a4, -1
+; RV32IZFHMIN-NEXT: .LBB13_6:
+; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a4, a1
+; RV32IZFHMIN-NEXT: and a1, a4, a2
+; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
-; RV32IZFHMIN-NEXT: beqz a3, .LBB13_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a4, a5, -1
-; RV32IZFHMIN-NEXT: .LBB13_6:
-; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_round_si64:
@@ -2472,16 +2456,16 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI13_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI13_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a3
+; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a4
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a4, 524288
+; RV32IZHINXMIN-NEXT: lui a3, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB13_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a4, a1
+; RV32IZHINXMIN-NEXT: mv a3, a1
; RV32IZHINXMIN-NEXT: .LBB13_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -2489,11 +2473,11 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a3, .LBB13_6
+; RV32IZHINXMIN-NEXT: beqz a4, .LBB13_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a4, a5, -1
+; RV32IZHINXMIN-NEXT: addi a3, a5, -1
; RV32IZHINXMIN-NEXT: .LBB13_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a4
+; RV32IZHINXMIN-NEXT: and a1, a2, a3
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_round_si64:
@@ -2972,40 +2956,38 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: lui a2, %hi(.LCPI17_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI17_1)(a2)
-; RV32IZFH-NEXT: and a0, s1, a0
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
-; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
+; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB17_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a4, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB17_4:
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI17_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI17_1)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB17_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: .LBB17_6:
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
-; RV32IZFH-NEXT: beqz a3, .LBB17_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a4, a5, -1
-; RV32IZFH-NEXT: .LBB17_6:
-; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_roundeven_si64:
@@ -3043,16 +3025,16 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI17_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI17_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a3, a2, s0
-; RV32IZHINX-NEXT: neg a2, a3
+; RV32IZHINX-NEXT: flt.s a4, a2, s0
+; RV32IZHINX-NEXT: neg a2, a4
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a4, 524288
+; RV32IZHINX-NEXT: lui a3, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB17_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a4, a1
+; RV32IZHINX-NEXT: mv a3, a1
; RV32IZHINX-NEXT: .LBB17_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -3060,11 +3042,11 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a3, .LBB17_6
+; RV32IZHINX-NEXT: beqz a4, .LBB17_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a4, a5, -1
+; RV32IZHINX-NEXT: addi a3, a5, -1
; RV32IZHINX-NEXT: .LBB17_6:
-; RV32IZHINX-NEXT: and a1, a2, a4
+; RV32IZHINX-NEXT: and a1, a2, a3
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_roundeven_si64:
@@ -3102,41 +3084,39 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
-; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI17_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI17_0)(a2)
-; RV32IZFHMIN-NEXT: and a0, s1, a0
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a2, a2
-; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
+; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB17_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a4, a1
+; RV32IZFHMIN-NEXT: mv a2, a1
; RV32IZFHMIN-NEXT: .LBB17_4:
+; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI17_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI17_0)(a1)
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: beqz a3, .LBB17_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a2, a4, -1
+; RV32IZFHMIN-NEXT: .LBB17_6:
+; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a4, a1
+; RV32IZFHMIN-NEXT: and a1, a4, a2
+; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
-; RV32IZFHMIN-NEXT: beqz a3, .LBB17_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a4, a5, -1
-; RV32IZFHMIN-NEXT: .LBB17_6:
-; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_roundeven_si64:
@@ -3188,16 +3168,16 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI17_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI17_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a3
+; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a4
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a4, 524288
+; RV32IZHINXMIN-NEXT: lui a3, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB17_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a4, a1
+; RV32IZHINXMIN-NEXT: mv a3, a1
; RV32IZHINXMIN-NEXT: .LBB17_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -3205,11 +3185,11 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a3, .LBB17_6
+; RV32IZHINXMIN-NEXT: beqz a4, .LBB17_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a4, a5, -1
+; RV32IZHINXMIN-NEXT: addi a3, a5, -1
; RV32IZHINXMIN-NEXT: .LBB17_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a4
+; RV32IZHINXMIN-NEXT: and a1, a2, a3
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_roundeven_si64:
@@ -3688,40 +3668,38 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: lui a2, %hi(.LCPI21_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI21_1)(a2)
-; RV32IZFH-NEXT: and a0, s1, a0
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
-; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
+; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB21_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a4, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB21_4:
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI21_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI21_1)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB21_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: .LBB21_6:
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
-; RV32IZFH-NEXT: beqz a3, .LBB21_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a4, a5, -1
-; RV32IZFH-NEXT: .LBB21_6:
-; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_rint_si64:
@@ -3759,16 +3737,16 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI21_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI21_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a3, a2, s0
-; RV32IZHINX-NEXT: neg a2, a3
+; RV32IZHINX-NEXT: flt.s a4, a2, s0
+; RV32IZHINX-NEXT: neg a2, a4
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a4, 524288
+; RV32IZHINX-NEXT: lui a3, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB21_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a4, a1
+; RV32IZHINX-NEXT: mv a3, a1
; RV32IZHINX-NEXT: .LBB21_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -3776,11 +3754,11 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a3, .LBB21_6
+; RV32IZHINX-NEXT: beqz a4, .LBB21_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a4, a5, -1
+; RV32IZHINX-NEXT: addi a3, a5, -1
; RV32IZHINX-NEXT: .LBB21_6:
-; RV32IZHINX-NEXT: and a1, a2, a4
+; RV32IZHINX-NEXT: and a1, a2, a3
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_rint_si64:
@@ -3818,41 +3796,39 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
-; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI21_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI21_0)(a2)
-; RV32IZFHMIN-NEXT: and a0, s1, a0
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a2, a2
-; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
+; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB21_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a4, a1
+; RV32IZFHMIN-NEXT: mv a2, a1
; RV32IZFHMIN-NEXT: .LBB21_4:
+; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI21_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI21_0)(a1)
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: beqz a3, .LBB21_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a2, a4, -1
+; RV32IZFHMIN-NEXT: .LBB21_6:
+; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a4, a1
+; RV32IZFHMIN-NEXT: and a1, a4, a2
+; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
-; RV32IZFHMIN-NEXT: beqz a3, .LBB21_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a4, a5, -1
-; RV32IZFHMIN-NEXT: .LBB21_6:
-; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_rint_si64:
@@ -3904,16 +3880,16 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI21_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI21_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a3
+; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a4
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a4, 524288
+; RV32IZHINXMIN-NEXT: lui a3, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB21_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a4, a1
+; RV32IZHINXMIN-NEXT: mv a3, a1
; RV32IZHINXMIN-NEXT: .LBB21_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -3921,11 +3897,11 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a3, .LBB21_6
+; RV32IZHINXMIN-NEXT: beqz a4, .LBB21_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a4, a5, -1
+; RV32IZHINXMIN-NEXT: addi a3, a5, -1
; RV32IZHINXMIN-NEXT: .LBB21_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a4
+; RV32IZHINXMIN-NEXT: and a1, a2, a3
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_rint_si64:
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 98c886333d69a0..a0c85ab4dca7f7 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -630,8 +630,8 @@ define void @zext16_abs8(i8 %x, ptr %p) {
; RV32I-LABEL: zext16_abs8:
; RV32I: # %bb.0:
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: srai a0, a0, 24
; RV32I-NEXT: srai a2, a0, 31
+; RV32I-NEXT: srai a0, a0, 24
; RV32I-NEXT: xor a0, a0, a2
; RV32I-NEXT: sub a0, a0, a2
; RV32I-NEXT: sh a0, 0(a1)
@@ -648,8 +648,8 @@ define void @zext16_abs8(i8 %x, ptr %p) {
; RV64I-LABEL: zext16_abs8:
; RV64I: # %bb.0:
; RV64I-NEXT: slli a0, a0, 56
-; RV64I-NEXT: srai a0, a0, 56
; RV64I-NEXT: srai a2, a0, 63
+; RV64I-NEXT: srai a0, a0, 56
; RV64I-NEXT: xor a0, a0, a2
; RV64I-NEXT: subw a0, a0, a2
; RV64I-NEXT: sh a0, 0(a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index b3bda5973eb8c4..a6b2d3141f22f9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -2190,65 +2190,66 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset fs0, -32
-; CHECK-NOV-NEXT: fmv.d fs0, fa0
-; CHECK-NOV-NEXT: fmv.d fa0, fa1
+; CHECK-NOV-NEXT: fmv.d fs0, fa1
; CHECK-NOV-NEXT: call __fixdfti
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.d fa0, fs0
; CHECK-NOV-NEXT: call __fixdfti
-; CHECK-NOV-NEXT: li a2, -1
-; CHECK-NOV-NEXT: srli a3, a2, 1
-; CHECK-NOV-NEXT: beqz s1, .LBB18_3
+; CHECK-NOV-NEXT: mv a2, a0
+; CHECK-NOV-NEXT: li a0, -1
+; CHECK-NOV-NEXT: srli a3, a0, 1
+; CHECK-NOV-NEXT: beqz a1, .LBB18_3
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: slti a4, s1, 0
-; CHECK-NOV-NEXT: bnez a1, .LBB18_4
+; CHECK-NOV-NEXT: slti a4, a1, 0
+; CHECK-NOV-NEXT: bnez s1, .LBB18_4
; CHECK-NOV-NEXT: .LBB18_2:
-; CHECK-NOV-NEXT: sltu a5, a0, a3
+; CHECK-NOV-NEXT: sltu a5, s0, a3
; CHECK-NOV-NEXT: beqz a5, .LBB18_5
; CHECK-NOV-NEXT: j .LBB18_6
; CHECK-NOV-NEXT: .LBB18_3:
-; CHECK-NOV-NEXT: sltu a4, s0, a3
-; CHECK-NOV-NEXT: beqz a1, .LBB18_2
+; CHECK-NOV-NEXT: sltu a4, a2, a3
+; CHECK-NOV-NEXT: beqz s1, .LBB18_2
; CHECK-NOV-NEXT: .LBB18_4: # %entry
-; CHECK-NOV-NEXT: slti a5, a1, 0
+; CHECK-NOV-NEXT: slti a5, s1, 0
; CHECK-NOV-NEXT: bnez a5, .LBB18_6
; CHECK-NOV-NEXT: .LBB18_5: # %entry
-; CHECK-NOV-NEXT: mv a0, a3
+; CHECK-NOV-NEXT: mv s0, a3
; CHECK-NOV-NEXT: .LBB18_6: # %entry
; CHECK-NOV-NEXT: neg a6, a5
; CHECK-NOV-NEXT: neg a5, a4
-; CHECK-NOV-NEXT: and a5, a5, s1
+; CHECK-NOV-NEXT: and a5, a5, a1
; CHECK-NOV-NEXT: bnez a4, .LBB18_8
; CHECK-NOV-NEXT: # %bb.7: # %entry
-; CHECK-NOV-NEXT: mv s0, a3
+; CHECK-NOV-NEXT: mv a2, a3
; CHECK-NOV-NEXT: .LBB18_8: # %entry
-; CHECK-NOV-NEXT: and a4, a6, a1
-; CHECK-NOV-NEXT: slli a1, a2, 63
-; CHECK-NOV-NEXT: beq a5, a2, .LBB18_11
+; CHECK-NOV-NEXT: and a4, a6, s1
+; CHECK-NOV-NEXT: slli a1, a0, 63
+; CHECK-NOV-NEXT: beq a5, a0, .LBB18_11
; CHECK-NOV-NEXT: # %bb.9: # %entry
; CHECK-NOV-NEXT: slti a3, a5, 0
; CHECK-NOV-NEXT: xori a3, a3, 1
-; CHECK-NOV-NEXT: bne a4, a2, .LBB18_12
+; CHECK-NOV-NEXT: bne a4, a0, .LBB18_12
; CHECK-NOV-NEXT: .LBB18_10:
-; CHECK-NOV-NEXT: sltu a2, a1, a0
-; CHECK-NOV-NEXT: beqz a2, .LBB18_13
+; CHECK-NOV-NEXT: sltu a0, a1, s0
+; CHECK-NOV-NEXT: beqz a0, .LBB18_13
; CHECK-NOV-NEXT: j .LBB18_14
; CHECK-NOV-NEXT: .LBB18_11:
-; CHECK-NOV-NEXT: sltu a3, a1, s0
-; CHECK-NOV-NEXT: beq a4, a2, .LBB18_10
+; CHECK-NOV-NEXT: sltu a3, a1, a2
+; CHECK-NOV-NEXT: beq a4, a0, .LBB18_10
; CHECK-NOV-NEXT: .LBB18_12: # %entry
-; CHECK-NOV-NEXT: slti a2, a4, 0
-; CHECK-NOV-NEXT: xori a2, a2, 1
-; CHECK-NOV-NEXT: bnez a2, .LBB18_14
+; CHECK-NOV-NEXT: slti a0, a4, 0
+; CHECK-NOV-NEXT: xori a0, a0, 1
+; CHECK-NOV-NEXT: bnez a0, .LBB18_14
; CHECK-NOV-NEXT: .LBB18_13: # %entry
-; CHECK-NOV-NEXT: mv a0, a1
+; CHECK-NOV-NEXT: mv s0, a1
; CHECK-NOV-NEXT: .LBB18_14: # %entry
; CHECK-NOV-NEXT: bnez a3, .LBB18_16
; CHECK-NOV-NEXT: # %bb.15: # %entry
-; CHECK-NOV-NEXT: mv s0, a1
+; CHECK-NOV-NEXT: mv a2, a1
; CHECK-NOV-NEXT: .LBB18_16: # %entry
-; CHECK-NOV-NEXT: mv a1, s0
+; CHECK-NOV-NEXT: mv a0, s0
+; CHECK-NOV-NEXT: mv a1, a2
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -2273,43 +2274,43 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT: vfmv.f.s fa0, v8
+; CHECK-V-NEXT: vslidedown.vi v9, v8, 1
+; CHECK-V-NEXT: vfmv.f.s fa0, v9
; CHECK-V-NEXT: call __fixdfti
; CHECK-V-NEXT: mv s0, a0
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixdfti
; CHECK-V-NEXT: li a2, -1
; CHECK-V-NEXT: srli a3, a2, 1
-; CHECK-V-NEXT: beqz s1, .LBB18_3
+; CHECK-V-NEXT: beqz a1, .LBB18_3
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: slti a4, s1, 0
-; CHECK-V-NEXT: bnez a1, .LBB18_4
+; CHECK-V-NEXT: slti a4, a1, 0
+; CHECK-V-NEXT: bnez s1, .LBB18_4
; CHECK-V-NEXT: .LBB18_2:
-; CHECK-V-NEXT: sltu a5, a0, a3
+; CHECK-V-NEXT: sltu a5, s0, a3
; CHECK-V-NEXT: beqz a5, .LBB18_5
; CHECK-V-NEXT: j .LBB18_6
; CHECK-V-NEXT: .LBB18_3:
-; CHECK-V-NEXT: sltu a4, s0, a3
-; CHECK-V-NEXT: beqz a1, .LBB18_2
+; CHECK-V-NEXT: sltu a4, a0, a3
+; CHECK-V-NEXT: beqz s1, .LBB18_2
; CHECK-V-NEXT: .LBB18_4: # %entry
-; CHECK-V-NEXT: slti a5, a1, 0
+; CHECK-V-NEXT: slti a5, s1, 0
; CHECK-V-NEXT: bnez a5, .LBB18_6
; CHECK-V-NEXT: .LBB18_5: # %entry
-; CHECK-V-NEXT: mv a0, a3
+; CHECK-V-NEXT: mv s0, a3
; CHECK-V-NEXT: .LBB18_6: # %entry
; CHECK-V-NEXT: neg a6, a5
; CHECK-V-NEXT: neg a5, a4
-; CHECK-V-NEXT: and a5, a5, s1
+; CHECK-V-NEXT: and a5, a5, a1
; CHECK-V-NEXT: bnez a4, .LBB18_8
; CHECK-V-NEXT: # %bb.7: # %entry
-; CHECK-V-NEXT: mv s0, a3
+; CHECK-V-NEXT: mv a0, a3
; CHECK-V-NEXT: .LBB18_8: # %entry
-; CHECK-V-NEXT: and a4, a6, a1
+; CHECK-V-NEXT: and a4, a6, s1
; CHECK-V-NEXT: slli a1, a2, 63
; CHECK-V-NEXT: beq a5, a2, .LBB18_11
; CHECK-V-NEXT: # %bb.9: # %entry
@@ -2317,26 +2318,26 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
; CHECK-V-NEXT: xori a3, a3, 1
; CHECK-V-NEXT: bne a4, a2, .LBB18_12
; CHECK-V-NEXT: .LBB18_10:
-; CHECK-V-NEXT: sltu a2, a1, a0
+; CHECK-V-NEXT: sltu a2, a1, s0
; CHECK-V-NEXT: beqz a2, .LBB18_13
; CHECK-V-NEXT: j .LBB18_14
; CHECK-V-NEXT: .LBB18_11:
-; CHECK-V-NEXT: sltu a3, a1, s0
+; CHECK-V-NEXT: sltu a3, a1, a0
; CHECK-V-NEXT: beq a4, a2, .LBB18_10
; CHECK-V-NEXT: .LBB18_12: # %entry
; CHECK-V-NEXT: slti a2, a4, 0
; CHECK-V-NEXT: xori a2, a2, 1
; CHECK-V-NEXT: bnez a2, .LBB18_14
; CHECK-V-NEXT: .LBB18_13: # %entry
-; CHECK-V-NEXT: mv a0, a1
+; CHECK-V-NEXT: mv s0, a1
; CHECK-V-NEXT: .LBB18_14: # %entry
; CHECK-V-NEXT: bnez a3, .LBB18_16
; CHECK-V-NEXT: # %bb.15: # %entry
-; CHECK-V-NEXT: mv s0, a1
+; CHECK-V-NEXT: mv a0, a1
; CHECK-V-NEXT: .LBB18_16: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v8, s0
-; CHECK-V-NEXT: vmv.s.x v9, a0
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: vmv.s.x v9, s0
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
@@ -2369,19 +2370,19 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset fs0, -32
-; CHECK-NOV-NEXT: fmv.d fs0, fa0
-; CHECK-NOV-NEXT: fmv.d fa0, fa1
+; CHECK-NOV-NEXT: fmv.d fs0, fa1
; CHECK-NOV-NEXT: call __fixunsdfti
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.d fa0, fs0
; CHECK-NOV-NEXT: call __fixunsdfti
-; CHECK-NOV-NEXT: snez a2, s1
; CHECK-NOV-NEXT: snez a1, a1
+; CHECK-NOV-NEXT: snez a2, s1
+; CHECK-NOV-NEXT: addi a2, a2, -1
+; CHECK-NOV-NEXT: and a2, a2, s0
; CHECK-NOV-NEXT: addi a1, a1, -1
-; CHECK-NOV-NEXT: and a0, a1, a0
-; CHECK-NOV-NEXT: addi a1, a2, -1
-; CHECK-NOV-NEXT: and a1, a1, s0
+; CHECK-NOV-NEXT: and a1, a1, a0
+; CHECK-NOV-NEXT: mv a0, a2
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -2406,25 +2407,25 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT: vfmv.f.s fa0, v8
+; CHECK-V-NEXT: vslidedown.vi v9, v8, 1
+; CHECK-V-NEXT: vfmv.f.s fa0, v9
; CHECK-V-NEXT: call __fixunsdfti
; CHECK-V-NEXT: mv s0, a0
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixunsdfti
-; CHECK-V-NEXT: snez a2, s1
; CHECK-V-NEXT: snez a1, a1
-; CHECK-V-NEXT: addi a1, a1, -1
-; CHECK-V-NEXT: and a0, a1, a0
+; CHECK-V-NEXT: snez a2, s1
; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a2, a2, s0
+; CHECK-V-NEXT: addi a1, a1, -1
+; CHECK-V-NEXT: and a0, a1, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v8, a2
-; CHECK-V-NEXT: vmv.s.x v9, a0
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: vmv.s.x v9, a2
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
@@ -2466,32 +2467,32 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) {
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB20_2: # %entry
-; CHECK-NOV-NEXT: slti a4, s1, 1
; CHECK-NOV-NEXT: slti a3, a1, 1
+; CHECK-NOV-NEXT: slti a4, s1, 1
; CHECK-NOV-NEXT: blez a1, .LBB20_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
; CHECK-NOV-NEXT: li a1, 1
; CHECK-NOV-NEXT: .LBB20_4: # %entry
+; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, a0
-; CHECK-NOV-NEXT: neg a0, a4
; CHECK-NOV-NEXT: beqz a1, .LBB20_7
; CHECK-NOV-NEXT: # %bb.5: # %entry
; CHECK-NOV-NEXT: sgtz a1, a1
-; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: and a4, a4, s0
; CHECK-NOV-NEXT: bnez a2, .LBB20_8
; CHECK-NOV-NEXT: .LBB20_6:
-; CHECK-NOV-NEXT: snez a2, a0
+; CHECK-NOV-NEXT: snez a0, a4
; CHECK-NOV-NEXT: j .LBB20_9
; CHECK-NOV-NEXT: .LBB20_7:
; CHECK-NOV-NEXT: snez a1, a3
-; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: and a4, a4, s0
; CHECK-NOV-NEXT: beqz a2, .LBB20_6
; CHECK-NOV-NEXT: .LBB20_8: # %entry
-; CHECK-NOV-NEXT: sgtz a2, a2
+; CHECK-NOV-NEXT: sgtz a0, a2
; CHECK-NOV-NEXT: .LBB20_9: # %entry
-; CHECK-NOV-NEXT: neg a2, a2
-; CHECK-NOV-NEXT: and a0, a2, a0
+; CHECK-NOV-NEXT: neg a0, a0
+; CHECK-NOV-NEXT: and a0, a0, a4
; CHECK-NOV-NEXT: neg a1, a1
; CHECK-NOV-NEXT: and a1, a1, a3
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
@@ -2533,15 +2534,15 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) {
; CHECK-V-NEXT: # %bb.1: # %entry
; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB20_2: # %entry
-; CHECK-V-NEXT: slti a3, s1, 1
; CHECK-V-NEXT: slti a4, a1, 1
+; CHECK-V-NEXT: slti a3, s1, 1
; CHECK-V-NEXT: blez a1, .LBB20_4
; CHECK-V-NEXT: # %bb.3: # %entry
; CHECK-V-NEXT: li a1, 1
; CHECK-V-NEXT: .LBB20_4: # %entry
+; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: neg a4, a4
; CHECK-V-NEXT: and a0, a4, a0
-; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: beqz a1, .LBB20_7
; CHECK-V-NEXT: # %bb.5: # %entry
; CHECK-V-NEXT: sgtz a1, a1
@@ -2596,65 +2597,66 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset fs0, -32
-; CHECK-NOV-NEXT: fmv.s fs0, fa0
-; CHECK-NOV-NEXT: fmv.s fa0, fa1
+; CHECK-NOV-NEXT: fmv.s fs0, fa1
; CHECK-NOV-NEXT: call __fixsfti
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.s fa0, fs0
; CHECK-NOV-NEXT: call __fixsfti
-; CHECK-NOV-NEXT: li a2, -1
-; CHECK-NOV-NEXT: srli a3, a2, 1
-; CHECK-NOV-NEXT: beqz s1, .LBB21_3
+; CHECK-NOV-NEXT: mv a2, a0
+; CHECK-NOV-NEXT: li a0, -1
+; CHECK-NOV-NEXT: srli a3, a0, 1
+; CHECK-NOV-NEXT: beqz a1, .LBB21_3
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: slti a4, s1, 0
-; CHECK-NOV-NEXT: bnez a1, .LBB21_4
+; CHECK-NOV-NEXT: slti a4, a1, 0
+; CHECK-NOV-NEXT: bnez s1, .LBB21_4
; CHECK-NOV-NEXT: .LBB21_2:
-; CHECK-NOV-NEXT: sltu a5, a0, a3
+; CHECK-NOV-NEXT: sltu a5, s0, a3
; CHECK-NOV-NEXT: beqz a5, .LBB21_5
; CHECK-NOV-NEXT: j .LBB21_6
; CHECK-NOV-NEXT: .LBB21_3:
-; CHECK-NOV-NEXT: sltu a4, s0, a3
-; CHECK-NOV-NEXT: beqz a1, .LBB21_2
+; CHECK-NOV-NEXT: sltu a4, a2, a3
+; CHECK-NOV-NEXT: beqz s1, .LBB21_2
; CHECK-NOV-NEXT: .LBB21_4: # %entry
-; CHECK-NOV-NEXT: slti a5, a1, 0
+; CHECK-NOV-NEXT: slti a5, s1, 0
; CHECK-NOV-NEXT: bnez a5, .LBB21_6
; CHECK-NOV-NEXT: .LBB21_5: # %entry
-; CHECK-NOV-NEXT: mv a0, a3
+; CHECK-NOV-NEXT: mv s0, a3
; CHECK-NOV-NEXT: .LBB21_6: # %entry
; CHECK-NOV-NEXT: neg a6, a5
; CHECK-NOV-NEXT: neg a5, a4
-; CHECK-NOV-NEXT: and a5, a5, s1
+; CHECK-NOV-NEXT: and a5, a5, a1
; CHECK-NOV-NEXT: bnez a4, .LBB21_8
; CHECK-NOV-NEXT: # %bb.7: # %entry
-; CHECK-NOV-NEXT: mv s0, a3
+; CHECK-NOV-NEXT: mv a2, a3
; CHECK-NOV-NEXT: .LBB21_8: # %entry
-; CHECK-NOV-NEXT: and a4, a6, a1
-; CHECK-NOV-NEXT: slli a1, a2, 63
-; CHECK-NOV-NEXT: beq a5, a2, .LBB21_11
+; CHECK-NOV-NEXT: and a4, a6, s1
+; CHECK-NOV-NEXT: slli a1, a0, 63
+; CHECK-NOV-NEXT: beq a5, a0, .LBB21_11
; CHECK-NOV-NEXT: # %bb.9: # %entry
; CHECK-NOV-NEXT: slti a3, a5, 0
; CHECK-NOV-NEXT: xori a3, a3, 1
-; CHECK-NOV-NEXT: bne a4, a2, .LBB21_12
+; CHECK-NOV-NEXT: bne a4, a0, .LBB21_12
; CHECK-NOV-NEXT: .LBB21_10:
-; CHECK-NOV-NEXT: sltu a2, a1, a0
-; CHECK-NOV-NEXT: beqz a2, .LBB21_13
+; CHECK-NOV-NEXT: sltu a0, a1, s0
+; CHECK-NOV-NEXT: beqz a0, .LBB21_13
; CHECK-NOV-NEXT: j .LBB21_14
; CHECK-NOV-NEXT: .LBB21_11:
-; CHECK-NOV-NEXT: sltu a3, a1, s0
-; CHECK-NOV-NEXT: beq a4, a2, .LBB21_10
+; CHECK-NOV-NEXT: sltu a3, a1, a2
+; CHECK-NOV-NEXT: beq a4, a0, .LBB21_10
; CHECK-NOV-NEXT: .LBB21_12: # %entry
-; CHECK-NOV-NEXT: slti a2, a4, 0
-; CHECK-NOV-NEXT: xori a2, a2, 1
-; CHECK-NOV-NEXT: bnez a2, .LBB21_14
+; CHECK-NOV-NEXT: slti a0, a4, 0
+; CHECK-NOV-NEXT: xori a0, a0, 1
+; CHECK-NOV-NEXT: bnez a0, .LBB21_14
; CHECK-NOV-NEXT: .LBB21_13: # %entry
-; CHECK-NOV-NEXT: mv a0, a1
+; CHECK-NOV-NEXT: mv s0, a1
; CHECK-NOV-NEXT: .LBB21_14: # %entry
; CHECK-NOV-NEXT: bnez a3, .LBB21_16
; CHECK-NOV-NEXT: # %bb.15: # %entry
-; CHECK-NOV-NEXT: mv s0, a1
+; CHECK-NOV-NEXT: mv a2, a1
; CHECK-NOV-NEXT: .LBB21_16: # %entry
-; CHECK-NOV-NEXT: mv a1, s0
+; CHECK-NOV-NEXT: mv a0, s0
+; CHECK-NOV-NEXT: mv a1, a2
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -2679,43 +2681,43 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-V-NEXT: vfmv.f.s fa0, v8
+; CHECK-V-NEXT: vslidedown.vi v9, v8, 1
+; CHECK-V-NEXT: vfmv.f.s fa0, v9
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: mv s0, a0
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: li a2, -1
; CHECK-V-NEXT: srli a3, a2, 1
-; CHECK-V-NEXT: beqz s1, .LBB21_3
+; CHECK-V-NEXT: beqz a1, .LBB21_3
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: slti a4, s1, 0
-; CHECK-V-NEXT: bnez a1, .LBB21_4
+; CHECK-V-NEXT: slti a4, a1, 0
+; CHECK-V-NEXT: bnez s1, .LBB21_4
; CHECK-V-NEXT: .LBB21_2:
-; CHECK-V-NEXT: sltu a5, a0, a3
+; CHECK-V-NEXT: sltu a5, s0, a3
; CHECK-V-NEXT: beqz a5, .LBB21_5
; CHECK-V-NEXT: j .LBB21_6
; CHECK-V-NEXT: .LBB21_3:
-; CHECK-V-NEXT: sltu a4, s0, a3
-; CHECK-V-NEXT: beqz a1, .LBB21_2
+; CHECK-V-NEXT: sltu a4, a0, a3
+; CHECK-V-NEXT: beqz s1, .LBB21_2
; CHECK-V-NEXT: .LBB21_4: # %entry
-; CHECK-V-NEXT: slti a5, a1, 0
+; CHECK-V-NEXT: slti a5, s1, 0
; CHECK-V-NEXT: bnez a5, .LBB21_6
; CHECK-V-NEXT: .LBB21_5: # %entry
-; CHECK-V-NEXT: mv a0, a3
+; CHECK-V-NEXT: mv s0, a3
; CHECK-V-NEXT: .LBB21_6: # %entry
; CHECK-V-NEXT: neg a6, a5
; CHECK-V-NEXT: neg a5, a4
-; CHECK-V-NEXT: and a5, a5, s1
+; CHECK-V-NEXT: and a5, a5, a1
; CHECK-V-NEXT: bnez a4, .LBB21_8
; CHECK-V-NEXT: # %bb.7: # %entry
-; CHECK-V-NEXT: mv s0, a3
+; CHECK-V-NEXT: mv a0, a3
; CHECK-V-NEXT: .LBB21_8: # %entry
-; CHECK-V-NEXT: and a4, a6, a1
+; CHECK-V-NEXT: and a4, a6, s1
; CHECK-V-NEXT: slli a1, a2, 63
; CHECK-V-NEXT: beq a5, a2, .LBB21_11
; CHECK-V-NEXT: # %bb.9: # %entry
@@ -2723,26 +2725,26 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: xori a3, a3, 1
; CHECK-V-NEXT: bne a4, a2, .LBB21_12
; CHECK-V-NEXT: .LBB21_10:
-; CHECK-V-NEXT: sltu a2, a1, a0
+; CHECK-V-NEXT: sltu a2, a1, s0
; CHECK-V-NEXT: beqz a2, .LBB21_13
; CHECK-V-NEXT: j .LBB21_14
; CHECK-V-NEXT: .LBB21_11:
-; CHECK-V-NEXT: sltu a3, a1, s0
+; CHECK-V-NEXT: sltu a3, a1, a0
; CHECK-V-NEXT: beq a4, a2, .LBB21_10
; CHECK-V-NEXT: .LBB21_12: # %entry
; CHECK-V-NEXT: slti a2, a4, 0
; CHECK-V-NEXT: xori a2, a2, 1
; CHECK-V-NEXT: bnez a2, .LBB21_14
; CHECK-V-NEXT: .LBB21_13: # %entry
-; CHECK-V-NEXT: mv a0, a1
+; CHECK-V-NEXT: mv s0, a1
; CHECK-V-NEXT: .LBB21_14: # %entry
; CHECK-V-NEXT: bnez a3, .LBB21_16
; CHECK-V-NEXT: # %bb.15: # %entry
-; CHECK-V-NEXT: mv s0, a1
+; CHECK-V-NEXT: mv a0, a1
; CHECK-V-NEXT: .LBB21_16: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v8, s0
-; CHECK-V-NEXT: vmv.s.x v9, a0
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: vmv.s.x v9, s0
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
@@ -2775,19 +2777,19 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset fs0, -32
-; CHECK-NOV-NEXT: fmv.s fs0, fa0
-; CHECK-NOV-NEXT: fmv.s fa0, fa1
+; CHECK-NOV-NEXT: fmv.s fs0, fa1
; CHECK-NOV-NEXT: call __fixunssfti
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.s fa0, fs0
; CHECK-NOV-NEXT: call __fixunssfti
-; CHECK-NOV-NEXT: snez a2, s1
; CHECK-NOV-NEXT: snez a1, a1
+; CHECK-NOV-NEXT: snez a2, s1
+; CHECK-NOV-NEXT: addi a2, a2, -1
+; CHECK-NOV-NEXT: and a2, a2, s0
; CHECK-NOV-NEXT: addi a1, a1, -1
-; CHECK-NOV-NEXT: and a0, a1, a0
-; CHECK-NOV-NEXT: addi a1, a2, -1
-; CHECK-NOV-NEXT: and a1, a1, s0
+; CHECK-NOV-NEXT: and a1, a1, a0
+; CHECK-NOV-NEXT: mv a0, a2
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -2812,25 +2814,25 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-V-NEXT: vfmv.f.s fa0, v8
+; CHECK-V-NEXT: vslidedown.vi v9, v8, 1
+; CHECK-V-NEXT: vfmv.f.s fa0, v9
; CHECK-V-NEXT: call __fixunssfti
; CHECK-V-NEXT: mv s0, a0
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixunssfti
-; CHECK-V-NEXT: snez a2, s1
; CHECK-V-NEXT: snez a1, a1
-; CHECK-V-NEXT: addi a1, a1, -1
-; CHECK-V-NEXT: and a0, a1, a0
+; CHECK-V-NEXT: snez a2, s1
; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a2, a2, s0
+; CHECK-V-NEXT: addi a1, a1, -1
+; CHECK-V-NEXT: and a0, a1, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v8, a2
-; CHECK-V-NEXT: vmv.s.x v9, a0
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: vmv.s.x v9, a2
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
@@ -2872,32 +2874,32 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB23_2: # %entry
-; CHECK-NOV-NEXT: slti a4, s1, 1
; CHECK-NOV-NEXT: slti a3, a1, 1
+; CHECK-NOV-NEXT: slti a4, s1, 1
; CHECK-NOV-NEXT: blez a1, .LBB23_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
; CHECK-NOV-NEXT: li a1, 1
; CHECK-NOV-NEXT: .LBB23_4: # %entry
+; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, a0
-; CHECK-NOV-NEXT: neg a0, a4
; CHECK-NOV-NEXT: beqz a1, .LBB23_7
; CHECK-NOV-NEXT: # %bb.5: # %entry
; CHECK-NOV-NEXT: sgtz a1, a1
-; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: and a4, a4, s0
; CHECK-NOV-NEXT: bnez a2, .LBB23_8
; CHECK-NOV-NEXT: .LBB23_6:
-; CHECK-NOV-NEXT: snez a2, a0
+; CHECK-NOV-NEXT: snez a0, a4
; CHECK-NOV-NEXT: j .LBB23_9
; CHECK-NOV-NEXT: .LBB23_7:
; CHECK-NOV-NEXT: snez a1, a3
-; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: and a4, a4, s0
; CHECK-NOV-NEXT: beqz a2, .LBB23_6
; CHECK-NOV-NEXT: .LBB23_8: # %entry
-; CHECK-NOV-NEXT: sgtz a2, a2
+; CHECK-NOV-NEXT: sgtz a0, a2
; CHECK-NOV-NEXT: .LBB23_9: # %entry
-; CHECK-NOV-NEXT: neg a2, a2
-; CHECK-NOV-NEXT: and a0, a2, a0
+; CHECK-NOV-NEXT: neg a0, a0
+; CHECK-NOV-NEXT: and a0, a0, a4
; CHECK-NOV-NEXT: neg a1, a1
; CHECK-NOV-NEXT: and a1, a1, a3
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
@@ -2939,15 +2941,15 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: # %bb.1: # %entry
; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB23_2: # %entry
-; CHECK-V-NEXT: slti a3, s1, 1
; CHECK-V-NEXT: slti a4, a1, 1
+; CHECK-V-NEXT: slti a3, s1, 1
; CHECK-V-NEXT: blez a1, .LBB23_4
; CHECK-V-NEXT: # %bb.3: # %entry
; CHECK-V-NEXT: li a1, 1
; CHECK-V-NEXT: .LBB23_4: # %entry
+; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: neg a4, a4
; CHECK-V-NEXT: and a0, a4, a0
-; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: beqz a1, .LBB23_7
; CHECK-V-NEXT: # %bb.5: # %entry
; CHECK-V-NEXT: sgtz a1, a1
@@ -3002,8 +3004,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset s2, -32
-; CHECK-NOV-NEXT: mv s2, a0
-; CHECK-NOV-NEXT: fmv.w.x fa0, a1
+; CHECK-NOV-NEXT: mv s2, a1
+; CHECK-NOV-NEXT: fmv.w.x fa0, a0
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixsfti
; CHECK-NOV-NEXT: mv s0, a0
@@ -3011,58 +3013,60 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: fmv.w.x fa0, s2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixsfti
-; CHECK-NOV-NEXT: li a2, -1
-; CHECK-NOV-NEXT: srli a3, a2, 1
-; CHECK-NOV-NEXT: beqz s1, .LBB24_3
+; CHECK-NOV-NEXT: mv a2, a0
+; CHECK-NOV-NEXT: li a0, -1
+; CHECK-NOV-NEXT: srli a3, a0, 1
+; CHECK-NOV-NEXT: beqz a1, .LBB24_3
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: slti a4, s1, 0
-; CHECK-NOV-NEXT: bnez a1, .LBB24_4
+; CHECK-NOV-NEXT: slti a4, a1, 0
+; CHECK-NOV-NEXT: bnez s1, .LBB24_4
; CHECK-NOV-NEXT: .LBB24_2:
-; CHECK-NOV-NEXT: sltu a5, a0, a3
+; CHECK-NOV-NEXT: sltu a5, s0, a3
; CHECK-NOV-NEXT: beqz a5, .LBB24_5
; CHECK-NOV-NEXT: j .LBB24_6
; CHECK-NOV-NEXT: .LBB24_3:
-; CHECK-NOV-NEXT: sltu a4, s0, a3
-; CHECK-NOV-NEXT: beqz a1, .LBB24_2
+; CHECK-NOV-NEXT: sltu a4, a2, a3
+; CHECK-NOV-NEXT: beqz s1, .LBB24_2
; CHECK-NOV-NEXT: .LBB24_4: # %entry
-; CHECK-NOV-NEXT: slti a5, a1, 0
+; CHECK-NOV-NEXT: slti a5, s1, 0
; CHECK-NOV-NEXT: bnez a5, .LBB24_6
; CHECK-NOV-NEXT: .LBB24_5: # %entry
-; CHECK-NOV-NEXT: mv a0, a3
+; CHECK-NOV-NEXT: mv s0, a3
; CHECK-NOV-NEXT: .LBB24_6: # %entry
; CHECK-NOV-NEXT: neg a6, a5
; CHECK-NOV-NEXT: neg a5, a4
-; CHECK-NOV-NEXT: and a5, a5, s1
+; CHECK-NOV-NEXT: and a5, a5, a1
; CHECK-NOV-NEXT: bnez a4, .LBB24_8
; CHECK-NOV-NEXT: # %bb.7: # %entry
-; CHECK-NOV-NEXT: mv s0, a3
+; CHECK-NOV-NEXT: mv a2, a3
; CHECK-NOV-NEXT: .LBB24_8: # %entry
-; CHECK-NOV-NEXT: and a4, a6, a1
-; CHECK-NOV-NEXT: slli a1, a2, 63
-; CHECK-NOV-NEXT: beq a5, a2, .LBB24_11
+; CHECK-NOV-NEXT: and a4, a6, s1
+; CHECK-NOV-NEXT: slli a1, a0, 63
+; CHECK-NOV-NEXT: beq a5, a0, .LBB24_11
; CHECK-NOV-NEXT: # %bb.9: # %entry
; CHECK-NOV-NEXT: slti a3, a5, 0
; CHECK-NOV-NEXT: xori a3, a3, 1
-; CHECK-NOV-NEXT: bne a4, a2, .LBB24_12
+; CHECK-NOV-NEXT: bne a4, a0, .LBB24_12
; CHECK-NOV-NEXT: .LBB24_10:
-; CHECK-NOV-NEXT: sltu a2, a1, a0
-; CHECK-NOV-NEXT: beqz a2, .LBB24_13
+; CHECK-NOV-NEXT: sltu a0, a1, s0
+; CHECK-NOV-NEXT: beqz a0, .LBB24_13
; CHECK-NOV-NEXT: j .LBB24_14
; CHECK-NOV-NEXT: .LBB24_11:
-; CHECK-NOV-NEXT: sltu a3, a1, s0
-; CHECK-NOV-NEXT: beq a4, a2, .LBB24_10
+; CHECK-NOV-NEXT: sltu a3, a1, a2
+; CHECK-NOV-NEXT: beq a4, a0, .LBB24_10
; CHECK-NOV-NEXT: .LBB24_12: # %entry
-; CHECK-NOV-NEXT: slti a2, a4, 0
-; CHECK-NOV-NEXT: xori a2, a2, 1
-; CHECK-NOV-NEXT: bnez a2, .LBB24_14
+; CHECK-NOV-NEXT: slti a0, a4, 0
+; CHECK-NOV-NEXT: xori a0, a0, 1
+; CHECK-NOV-NEXT: bnez a0, .LBB24_14
; CHECK-NOV-NEXT: .LBB24_13: # %entry
-; CHECK-NOV-NEXT: mv a0, a1
+; CHECK-NOV-NEXT: mv s0, a1
; CHECK-NOV-NEXT: .LBB24_14: # %entry
; CHECK-NOV-NEXT: bnez a3, .LBB24_16
; CHECK-NOV-NEXT: # %bb.15: # %entry
-; CHECK-NOV-NEXT: mv s0, a1
+; CHECK-NOV-NEXT: mv a2, a1
; CHECK-NOV-NEXT: .LBB24_16: # %entry
-; CHECK-NOV-NEXT: mv a1, s0
+; CHECK-NOV-NEXT: mv a0, s0
+; CHECK-NOV-NEXT: mv a1, a2
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -3082,8 +3086,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: .cfi_offset s0, -16
; CHECK-V-NEXT: .cfi_offset s1, -24
; CHECK-V-NEXT: .cfi_offset s2, -32
-; CHECK-V-NEXT: mv s2, a0
-; CHECK-V-NEXT: fmv.w.x fa0, a1
+; CHECK-V-NEXT: mv s2, a1
+; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: mv s0, a0
@@ -3093,31 +3097,31 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: li a2, -1
; CHECK-V-NEXT: srli a3, a2, 1
-; CHECK-V-NEXT: beqz s1, .LBB24_3
+; CHECK-V-NEXT: beqz a1, .LBB24_3
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: slti a4, s1, 0
-; CHECK-V-NEXT: bnez a1, .LBB24_4
+; CHECK-V-NEXT: slti a4, a1, 0
+; CHECK-V-NEXT: bnez s1, .LBB24_4
; CHECK-V-NEXT: .LBB24_2:
-; CHECK-V-NEXT: sltu a5, a0, a3
+; CHECK-V-NEXT: sltu a5, s0, a3
; CHECK-V-NEXT: beqz a5, .LBB24_5
; CHECK-V-NEXT: j .LBB24_6
; CHECK-V-NEXT: .LBB24_3:
-; CHECK-V-NEXT: sltu a4, s0, a3
-; CHECK-V-NEXT: beqz a1, .LBB24_2
+; CHECK-V-NEXT: sltu a4, a0, a3
+; CHECK-V-NEXT: beqz s1, .LBB24_2
; CHECK-V-NEXT: .LBB24_4: # %entry
-; CHECK-V-NEXT: slti a5, a1, 0
+; CHECK-V-NEXT: slti a5, s1, 0
; CHECK-V-NEXT: bnez a5, .LBB24_6
; CHECK-V-NEXT: .LBB24_5: # %entry
-; CHECK-V-NEXT: mv a0, a3
+; CHECK-V-NEXT: mv s0, a3
; CHECK-V-NEXT: .LBB24_6: # %entry
; CHECK-V-NEXT: neg a6, a5
; CHECK-V-NEXT: neg a5, a4
-; CHECK-V-NEXT: and a5, a5, s1
+; CHECK-V-NEXT: and a5, a5, a1
; CHECK-V-NEXT: bnez a4, .LBB24_8
; CHECK-V-NEXT: # %bb.7: # %entry
-; CHECK-V-NEXT: mv s0, a3
+; CHECK-V-NEXT: mv a0, a3
; CHECK-V-NEXT: .LBB24_8: # %entry
-; CHECK-V-NEXT: and a4, a6, a1
+; CHECK-V-NEXT: and a4, a6, s1
; CHECK-V-NEXT: slli a1, a2, 63
; CHECK-V-NEXT: beq a5, a2, .LBB24_11
; CHECK-V-NEXT: # %bb.9: # %entry
@@ -3125,26 +3129,26 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: xori a3, a3, 1
; CHECK-V-NEXT: bne a4, a2, .LBB24_12
; CHECK-V-NEXT: .LBB24_10:
-; CHECK-V-NEXT: sltu a2, a1, a0
+; CHECK-V-NEXT: sltu a2, a1, s0
; CHECK-V-NEXT: beqz a2, .LBB24_13
; CHECK-V-NEXT: j .LBB24_14
; CHECK-V-NEXT: .LBB24_11:
-; CHECK-V-NEXT: sltu a3, a1, s0
+; CHECK-V-NEXT: sltu a3, a1, a0
; CHECK-V-NEXT: beq a4, a2, .LBB24_10
; CHECK-V-NEXT: .LBB24_12: # %entry
; CHECK-V-NEXT: slti a2, a4, 0
; CHECK-V-NEXT: xori a2, a2, 1
; CHECK-V-NEXT: bnez a2, .LBB24_14
; CHECK-V-NEXT: .LBB24_13: # %entry
-; CHECK-V-NEXT: mv a0, a1
+; CHECK-V-NEXT: mv s0, a1
; CHECK-V-NEXT: .LBB24_14: # %entry
; CHECK-V-NEXT: bnez a3, .LBB24_16
; CHECK-V-NEXT: # %bb.15: # %entry
-; CHECK-V-NEXT: mv s0, a1
+; CHECK-V-NEXT: mv a0, a1
; CHECK-V-NEXT: .LBB24_16: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v9, s0
-; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: vmv.s.x v9, a0
+; CHECK-V-NEXT: vmv.s.x v8, s0
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -3175,8 +3179,8 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset s2, -32
-; CHECK-NOV-NEXT: mv s0, a0
-; CHECK-NOV-NEXT: fmv.w.x fa0, a1
+; CHECK-NOV-NEXT: mv s0, a1
+; CHECK-NOV-NEXT: fmv.w.x fa0, a0
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixunssfti
; CHECK-NOV-NEXT: mv s1, a0
@@ -3184,12 +3188,13 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: fmv.w.x fa0, s0
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixunssfti
-; CHECK-NOV-NEXT: snez a2, s2
; CHECK-NOV-NEXT: snez a1, a1
+; CHECK-NOV-NEXT: snez a2, s2
+; CHECK-NOV-NEXT: addi a2, a2, -1
+; CHECK-NOV-NEXT: and a2, a2, s1
; CHECK-NOV-NEXT: addi a1, a1, -1
-; CHECK-NOV-NEXT: and a0, a1, a0
-; CHECK-NOV-NEXT: addi a1, a2, -1
-; CHECK-NOV-NEXT: and a1, a1, s1
+; CHECK-NOV-NEXT: and a1, a1, a0
+; CHECK-NOV-NEXT: mv a0, a2
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -3209,8 +3214,8 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: .cfi_offset s0, -16
; CHECK-V-NEXT: .cfi_offset s1, -24
; CHECK-V-NEXT: .cfi_offset s2, -32
-; CHECK-V-NEXT: mv s0, a0
-; CHECK-V-NEXT: fmv.w.x fa0, a1
+; CHECK-V-NEXT: mv s0, a1
+; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2
; CHECK-V-NEXT: call __fixunssfti
; CHECK-V-NEXT: mv s1, a0
@@ -3218,15 +3223,15 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2
; CHECK-V-NEXT: call __fixunssfti
-; CHECK-V-NEXT: snez a2, s2
; CHECK-V-NEXT: snez a1, a1
-; CHECK-V-NEXT: addi a1, a1, -1
-; CHECK-V-NEXT: and a0, a1, a0
+; CHECK-V-NEXT: snez a2, s2
; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a2, a2, s1
+; CHECK-V-NEXT: addi a1, a1, -1
+; CHECK-V-NEXT: and a0, a1, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v9, a2
-; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: vmv.s.x v9, a0
+; CHECK-V-NEXT: vmv.s.x v8, a2
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -3269,32 +3274,32 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB26_2: # %entry
-; CHECK-NOV-NEXT: slti a4, s1, 1
; CHECK-NOV-NEXT: slti a3, a1, 1
+; CHECK-NOV-NEXT: slti a4, s1, 1
; CHECK-NOV-NEXT: blez a1, .LBB26_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
; CHECK-NOV-NEXT: li a1, 1
; CHECK-NOV-NEXT: .LBB26_4: # %entry
+; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, a0
-; CHECK-NOV-NEXT: neg a0, a4
; CHECK-NOV-NEXT: beqz a1, .LBB26_7
; CHECK-NOV-NEXT: # %bb.5: # %entry
; CHECK-NOV-NEXT: sgtz a1, a1
-; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: and a4, a4, s0
; CHECK-NOV-NEXT: bnez a2, .LBB26_8
; CHECK-NOV-NEXT: .LBB26_6:
-; CHECK-NOV-NEXT: snez a2, a0
+; CHECK-NOV-NEXT: snez a0, a4
; CHECK-NOV-NEXT: j .LBB26_9
; CHECK-NOV-NEXT: .LBB26_7:
; CHECK-NOV-NEXT: snez a1, a3
-; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: and a4, a4, s0
; CHECK-NOV-NEXT: beqz a2, .LBB26_6
; CHECK-NOV-NEXT: .LBB26_8: # %entry
-; CHECK-NOV-NEXT: sgtz a2, a2
+; CHECK-NOV-NEXT: sgtz a0, a2
; CHECK-NOV-NEXT: .LBB26_9: # %entry
-; CHECK-NOV-NEXT: neg a2, a2
-; CHECK-NOV-NEXT: and a0, a2, a0
+; CHECK-NOV-NEXT: neg a0, a0
+; CHECK-NOV-NEXT: and a0, a0, a4
; CHECK-NOV-NEXT: neg a1, a1
; CHECK-NOV-NEXT: and a1, a1, a3
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
@@ -3330,15 +3335,15 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: # %bb.1: # %entry
; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB26_2: # %entry
-; CHECK-V-NEXT: slti a3, s1, 1
; CHECK-V-NEXT: slti a4, a1, 1
+; CHECK-V-NEXT: slti a3, s1, 1
; CHECK-V-NEXT: blez a1, .LBB26_4
; CHECK-V-NEXT: # %bb.3: # %entry
; CHECK-V-NEXT: li a1, 1
; CHECK-V-NEXT: .LBB26_4: # %entry
+; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: neg a4, a4
; CHECK-V-NEXT: and a0, a4, a0
-; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: beqz a1, .LBB26_7
; CHECK-V-NEXT: # %bb.5: # %entry
; CHECK-V-NEXT: sgtz a1, a1
@@ -5811,15 +5816,15 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.d fa0, fs0
; CHECK-NOV-NEXT: call __fixdfti
-; CHECK-NOV-NEXT: mv a2, s1
-; CHECK-NOV-NEXT: mv a3, a1
+; CHECK-NOV-NEXT: mv a2, a1
; CHECK-NOV-NEXT: blez a1, .LBB47_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: li a3, 1
+; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB47_2: # %entry
-; CHECK-NOV-NEXT: blez a2, .LBB47_4
+; CHECK-NOV-NEXT: mv a3, s1
+; CHECK-NOV-NEXT: blez s1, .LBB47_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: li a2, 1
+; CHECK-NOV-NEXT: li a3, 1
; CHECK-NOV-NEXT: .LBB47_4: # %entry
; CHECK-NOV-NEXT: slti a1, a1, 1
; CHECK-NOV-NEXT: neg a1, a1
@@ -5827,11 +5832,11 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-NOV-NEXT: slti a0, s1, 1
; CHECK-NOV-NEXT: neg a0, a0
; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: slti a3, a3, 0
+; CHECK-NOV-NEXT: addi a3, a3, -1
+; CHECK-NOV-NEXT: and a0, a3, a0
; CHECK-NOV-NEXT: slti a2, a2, 0
; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: and a0, a2, a0
-; CHECK-NOV-NEXT: slti a2, a3, 0
-; CHECK-NOV-NEXT: addi a2, a2, -1
; CHECK-NOV-NEXT: and a1, a2, a1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -5867,15 +5872,15 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixdfti
-; CHECK-V-NEXT: mv a2, s1
-; CHECK-V-NEXT: mv a3, a1
+; CHECK-V-NEXT: mv a2, a1
; CHECK-V-NEXT: blez a1, .LBB47_2
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: li a3, 1
+; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB47_2: # %entry
-; CHECK-V-NEXT: blez a2, .LBB47_4
+; CHECK-V-NEXT: mv a3, s1
+; CHECK-V-NEXT: blez s1, .LBB47_4
; CHECK-V-NEXT: # %bb.3: # %entry
-; CHECK-V-NEXT: li a2, 1
+; CHECK-V-NEXT: li a3, 1
; CHECK-V-NEXT: .LBB47_4: # %entry
; CHECK-V-NEXT: slti a1, a1, 1
; CHECK-V-NEXT: neg a1, a1
@@ -5883,11 +5888,11 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-V-NEXT: slti a1, s1, 1
; CHECK-V-NEXT: neg a1, a1
; CHECK-V-NEXT: and a1, a1, s0
+; CHECK-V-NEXT: slti a3, a3, 0
+; CHECK-V-NEXT: addi a3, a3, -1
+; CHECK-V-NEXT: and a1, a3, a1
; CHECK-V-NEXT: slti a2, a2, 0
; CHECK-V-NEXT: addi a2, a2, -1
-; CHECK-V-NEXT: and a1, a2, a1
-; CHECK-V-NEXT: slti a2, a3, 0
-; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a0, a2, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
@@ -6197,15 +6202,15 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.s fa0, fs0
; CHECK-NOV-NEXT: call __fixsfti
-; CHECK-NOV-NEXT: mv a2, s1
-; CHECK-NOV-NEXT: mv a3, a1
+; CHECK-NOV-NEXT: mv a2, a1
; CHECK-NOV-NEXT: blez a1, .LBB50_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: li a3, 1
+; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB50_2: # %entry
-; CHECK-NOV-NEXT: blez a2, .LBB50_4
+; CHECK-NOV-NEXT: mv a3, s1
+; CHECK-NOV-NEXT: blez s1, .LBB50_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: li a2, 1
+; CHECK-NOV-NEXT: li a3, 1
; CHECK-NOV-NEXT: .LBB50_4: # %entry
; CHECK-NOV-NEXT: slti a1, a1, 1
; CHECK-NOV-NEXT: neg a1, a1
@@ -6213,11 +6218,11 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-NOV-NEXT: slti a0, s1, 1
; CHECK-NOV-NEXT: neg a0, a0
; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: slti a3, a3, 0
+; CHECK-NOV-NEXT: addi a3, a3, -1
+; CHECK-NOV-NEXT: and a0, a3, a0
; CHECK-NOV-NEXT: slti a2, a2, 0
; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: and a0, a2, a0
-; CHECK-NOV-NEXT: slti a2, a3, 0
-; CHECK-NOV-NEXT: addi a2, a2, -1
; CHECK-NOV-NEXT: and a1, a2, a1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -6253,15 +6258,15 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixsfti
-; CHECK-V-NEXT: mv a2, s1
-; CHECK-V-NEXT: mv a3, a1
+; CHECK-V-NEXT: mv a2, a1
; CHECK-V-NEXT: blez a1, .LBB50_2
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: li a3, 1
+; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB50_2: # %entry
-; CHECK-V-NEXT: blez a2, .LBB50_4
+; CHECK-V-NEXT: mv a3, s1
+; CHECK-V-NEXT: blez s1, .LBB50_4
; CHECK-V-NEXT: # %bb.3: # %entry
-; CHECK-V-NEXT: li a2, 1
+; CHECK-V-NEXT: li a3, 1
; CHECK-V-NEXT: .LBB50_4: # %entry
; CHECK-V-NEXT: slti a1, a1, 1
; CHECK-V-NEXT: neg a1, a1
@@ -6269,11 +6274,11 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-V-NEXT: slti a1, s1, 1
; CHECK-V-NEXT: neg a1, a1
; CHECK-V-NEXT: and a1, a1, s0
+; CHECK-V-NEXT: slti a3, a3, 0
+; CHECK-V-NEXT: addi a3, a3, -1
+; CHECK-V-NEXT: and a1, a3, a1
; CHECK-V-NEXT: slti a2, a2, 0
; CHECK-V-NEXT: addi a2, a2, -1
-; CHECK-V-NEXT: and a1, a2, a1
-; CHECK-V-NEXT: slti a2, a3, 0
-; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a0, a2, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
@@ -6575,15 +6580,15 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-NOV-NEXT: fmv.w.x fa0, s2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixsfti
-; CHECK-NOV-NEXT: mv a2, s1
-; CHECK-NOV-NEXT: mv a3, a1
+; CHECK-NOV-NEXT: mv a2, a1
; CHECK-NOV-NEXT: blez a1, .LBB53_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: li a3, 1
+; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB53_2: # %entry
-; CHECK-NOV-NEXT: blez a2, .LBB53_4
+; CHECK-NOV-NEXT: mv a3, s1
+; CHECK-NOV-NEXT: blez s1, .LBB53_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: li a2, 1
+; CHECK-NOV-NEXT: li a3, 1
; CHECK-NOV-NEXT: .LBB53_4: # %entry
; CHECK-NOV-NEXT: slti a1, a1, 1
; CHECK-NOV-NEXT: neg a1, a1
@@ -6591,11 +6596,11 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-NOV-NEXT: slti a0, s1, 1
; CHECK-NOV-NEXT: neg a0, a0
; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: slti a3, a3, 0
+; CHECK-NOV-NEXT: addi a3, a3, -1
+; CHECK-NOV-NEXT: and a0, a3, a0
; CHECK-NOV-NEXT: slti a2, a2, 0
; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: and a0, a2, a0
-; CHECK-NOV-NEXT: slti a2, a3, 0
-; CHECK-NOV-NEXT: addi a2, a2, -1
; CHECK-NOV-NEXT: and a1, a2, a1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -6625,15 +6630,15 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2
; CHECK-V-NEXT: call __fixsfti
-; CHECK-V-NEXT: mv a2, s1
-; CHECK-V-NEXT: mv a3, a1
+; CHECK-V-NEXT: mv a2, a1
; CHECK-V-NEXT: blez a1, .LBB53_2
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: li a3, 1
+; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB53_2: # %entry
-; CHECK-V-NEXT: blez a2, .LBB53_4
+; CHECK-V-NEXT: mv a3, s1
+; CHECK-V-NEXT: blez s1, .LBB53_4
; CHECK-V-NEXT: # %bb.3: # %entry
-; CHECK-V-NEXT: li a2, 1
+; CHECK-V-NEXT: li a3, 1
; CHECK-V-NEXT: .LBB53_4: # %entry
; CHECK-V-NEXT: slti a1, a1, 1
; CHECK-V-NEXT: neg a1, a1
@@ -6641,11 +6646,11 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-V-NEXT: slti a1, s1, 1
; CHECK-V-NEXT: neg a1, a1
; CHECK-V-NEXT: and a1, a1, s0
+; CHECK-V-NEXT: slti a3, a3, 0
+; CHECK-V-NEXT: addi a3, a3, -1
+; CHECK-V-NEXT: and a1, a3, a1
; CHECK-V-NEXT: slti a2, a2, 0
; CHECK-V-NEXT: addi a2, a2, -1
-; CHECK-V-NEXT: and a1, a2, a1
-; CHECK-V-NEXT: slti a2, a3, 0
-; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a0, a2, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v9, a0
diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index dd180b67e492a0..0c33e8973c2d20 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -715,43 +715,41 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
;
; SSE42-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NEXT: psubq %xmm1, %xmm2
; SSE42-NEXT: movdqa %xmm1, %xmm3
-; SSE42-NEXT: pxor %xmm2, %xmm3
-; SSE42-NEXT: pxor %xmm0, %xmm2
-; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
-; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: paddq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm3
+; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: pxor %xmm4, %xmm1
+; SSE42-NEXT: pxor %xmm4, %xmm0
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE42-NEXT: paddq %xmm3, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm3
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm1
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_cmp_v2i64_multiuse_cmp:
diff --git a/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll b/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
index 017024c173c3f7..b2cb2c3e04b3f4 100644
--- a/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
+++ b/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
@@ -52,10 +52,7 @@ alloca_21:
define i32 @kmovrk_1(<4 x ptr> %arg) {
; AVX512-LABEL: kmovrk_1:
; AVX512: # %bb.0: # %bb
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
-; AVX512-NEXT: kmovw %k0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x93,0xc0]
-; AVX512-NEXT: testb $15, %al # encoding: [0xa8,0x0f]
+; AVX512-NEXT: vptest %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x17,0xc0]
; AVX512-NEXT: jne .LBB2_1 # encoding: [0x75,A]
; AVX512-NEXT: # fixup A - offset: 1, value: .LBB2_1-1, kind: FK_PCRel_1
; AVX512-NEXT: # %bb.2: # %bb3
@@ -66,10 +63,7 @@ define i32 @kmovrk_1(<4 x ptr> %arg) {
;
; AVX512BW-LABEL: kmovrk_1:
; AVX512BW: # %bb.0: # %bb
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
-; AVX512BW-NEXT: kmovd %k0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x93,0xc0]
-; AVX512BW-NEXT: testb $15, %al # encoding: [0xa8,0x0f]
+; AVX512BW-NEXT: vptest %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x17,0xc0]
; AVX512BW-NEXT: jne .LBB2_1 # encoding: [0x75,A]
; AVX512BW-NEXT: # fixup A - offset: 1, value: .LBB2_1-1, kind: FK_PCRel_1
; AVX512BW-NEXT: # %bb.2: # %bb3
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
index 13d1265a249d1f..7e48b3719cf0ff 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
@@ -30,13 +30,13 @@ define <64 x i8> @add_v64i8_broadcasts(<64 x i8> %a0, i64 %a1, i8 %a2) {
; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
-; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm3
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm4
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm4
+; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 8d2bb77a9e1af6..5d7bf4a2c9788f 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -80,13 +80,13 @@ define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_pow2c:
; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: paddq %xmm0, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllq $4, %xmm2
; SSE-NEXT: psllq $2, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: paddq %xmm0, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_pow2c:
@@ -399,14 +399,12 @@ define i64 @combine_mul_self_demandedbits(i64 %x) {
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: imulq %rdi, %rax
-; SSE-NEXT: andq $-3, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_mul_self_demandedbits:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: imulq %rdi, %rax
-; AVX-NEXT: andq $-3, %rax
; AVX-NEXT: retq
%1 = mul i64 %x, %x
%2 = and i64 %1, -3
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index e12ca56023a7f2..33cc8e96f663f5 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -182,101 +182,101 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: sarl $31, %ebx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: xorl %ecx, %esi
; X86-NEXT: movl %esi, %ebp
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi
; X86-NEXT: subl %eax, %esi
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ebp
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl %ebx, %esi
; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: subl %edi, %ebp
-; X86-NEXT: sbbl %edi, %ebx
-; X86-NEXT: sbbl %edi, %edx
-; X86-NEXT: sbbl %edi, %esi
-; X86-NEXT: xorl %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: subl %ebx, %ebp
+; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %edi, %ecx
+; X86-NEXT: bsrl %ebx, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bsrl %ebp, %ebp
; X86-NEXT: xorl $31, %ebp
; X86-NEXT: addl $32, %ebp
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %edx, %ebp
; X86-NEXT: addl $64, %ebp
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %edi
+; X86-NEXT: orl %esi, %ebx
; X86-NEXT: cmovnel %ecx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: bsrl %edi, %esi
; X86-NEXT: xorl $31, %esi
-; X86-NEXT: bsrl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
; X86-NEXT: addl $32, %edx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: addl $64, %edx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: cmovnel %ecx, %edx
; X86-NEXT: xorl %esi, %esi
; X86-NEXT: subl %edx, %ebp
-; X86-NEXT: movl $0, %ebx
-; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
; X86-NEXT: movl $0, %eax
@@ -284,40 +284,40 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl $127, %ecx
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpl %ebp, %ecx
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: cmovnel %esi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: cmovnel %esi, %edx
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: cmovnel %esi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: cmovnel %esi, %eax
-; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: jne .LBB4_8
; X86-NEXT: # %bb.1: # %_udiv-special-cases
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: xorl $127, %ebx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: xorl $127, %edi
+; X86-NEXT: orl %ebp, %edi
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: je .LBB4_8
; X86-NEXT: # %bb.2: # %udiv-bb1
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -332,234 +332,233 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $15, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %ebx
-; X86-NEXT: movl 144(%esp,%ebx), %edx
-; X86-NEXT: movl 148(%esp,%ebx), %edi
+; X86-NEXT: movsbl %al, %edi
+; X86-NEXT: movl 144(%esp,%edi), %edx
+; X86-NEXT: movl 148(%esp,%edi), %esi
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shll %cl, %edx
; X86-NEXT: notb %cl
-; X86-NEXT: movl 140(%esp,%ebx), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shrl %esi
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: movl 136(%esp,%ebx), %esi
+; X86-NEXT: movl 140(%esp,%edi), %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shrl %ebx
+; X86-NEXT: shrl %cl, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl 136(%esp,%edi), %edx
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $0, %edx
; X86-NEXT: jae .LBB4_3
; X86-NEXT: # %bb.6:
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: jmp .LBB4_7
; X86-NEXT: .LBB4_3: # %udiv-preheader
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movb %bl, %ch
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movb %dl, %ch
; X86-NEXT: andb $7, %ch
-; X86-NEXT: movb %bl, %cl
+; X86-NEXT: movb %dl, %cl
; X86-NEXT: shrb $3, %cl
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ebp
-; X86-NEXT: movl 100(%esp,%ebp), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%esp,%ebp), %ebx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %esi, %edx
-; X86-NEXT: movl 88(%esp,%ebp), %ebp
-; X86-NEXT: movl 92(%esp,%eax), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: notb %cl
-; X86-NEXT: addl %ebx, %ebx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: movzbl %cl, %edx
+; X86-NEXT: movl 100(%esp,%edx), %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl 96(%esp,%edx), %edi
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %ebp
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: shrdl %cl, %esi, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 88(%esp,%edx), %ebx
+; X86-NEXT: movl 92(%esp,%edx), %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: notb %cl
+; X86-NEXT: addl %edi, %edi
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movb %ch, %cl
+; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: addl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB4_4: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: shldl $1, %ebp, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: shldl $1, %ebp, %edx
-; X86-NEXT: shldl $1, %edi, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: shldl $1, %edx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl $1, %eax, %ecx
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %eax, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sbbl %ebp, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl %edi, %esi
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %ebp
-; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: subl %ecx, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: sbbl %edi, %edx
-; X86-NEXT: movl (%esp), %edi # 4-byte Reload
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %eax, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edi, %esi
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %ebx, (%esp) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl $-1, %esi
-; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: jne .LBB4_4
; X86-NEXT: # %bb.5:
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: shldl $1, %eax, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: shldl $1, %ebx, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: shldl $1, %eax, %ebx
+; X86-NEXT: orl %ecx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: shldl $1, %esi, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: addl %esi, %esi
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
; X86-NEXT: .LBB4_8: # %udiv-end
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, %edi
; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: xorl %ecx, %ebx
; X86-NEXT: xorl %ecx, %eax
; X86-NEXT: xorl %ecx, %esi
; X86-NEXT: subl %ecx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %ecx, %ebx
; X86-NEXT: sbbl %ecx, %edx
-; X86-NEXT: sbbl %ecx, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl %esi, (%ebp)
-; X86-NEXT: movl %eax, 4(%ebp)
-; X86-NEXT: movl %edx, 8(%ebp)
-; X86-NEXT: movl %edi, 12(%ebp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %esi, (%ecx)
+; X86-NEXT: movl %eax, 4(%ecx)
+; X86-NEXT: movl %ebx, 8(%ecx)
+; X86-NEXT: movl %edx, 12(%ecx)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: imull %esi, %ecx
@@ -568,12 +567,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: mull %edx
; X86-NEXT: addl %edx, %ebp
; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %ebp
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %edi, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: subl (%esp), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll
index 135494ac25f8cb..b2614c5fe0493c 100644
--- a/llvm/test/CodeGen/X86/fold-masked-merge.ll
+++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll
@@ -56,9 +56,7 @@ define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
; NOBMI-LABEL: masked_merge2:
; NOBMI: # %bb.0:
; NOBMI-NEXT: movl %esi, %eax
-; NOBMI-NEXT: xorb %sil, %al
-; NOBMI-NEXT: andb %dil, %al
-; NOBMI-NEXT: xorb %sil, %al
+; NOBMI-NEXT: # kill: def $al killed $al killed $eax
; NOBMI-NEXT: retq
;
; BMI-LABEL: masked_merge2:
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index b212e9438e1b52..c79da37988e40b 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -459,8 +459,7 @@ define i32 @freeze_ashr(i32 %a0) nounwind {
; X64-LABEL: freeze_ashr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: sarl $3, %eax
-; X64-NEXT: sarl $3, %eax
+; X64-NEXT: sarl $6, %eax
; X64-NEXT: retq
%x = ashr i32 %a0, 3
%y = freeze i32 %x
@@ -531,30 +530,12 @@ define i32 @freeze_ashr_outofrange(i32 %a0) nounwind {
define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
; X86-LABEL: freeze_ashr_vec:
; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psraw $1, %xmm2
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: pandn %xmm2, %xmm3
-; X86-NEXT: psraw $3, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: por %xmm3, %xmm0
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psraw $3, %xmm2
-; X86-NEXT: psraw $1, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: pandn %xmm2, %xmm1
-; X86-NEXT: por %xmm1, %xmm0
+; X86-NEXT: psraw $4, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_ashr_vec:
; X64: # %bb.0:
-; X64-NEXT: vpsraw $1, %xmm0, %xmm1
-; X64-NEXT: vpsraw $3, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-NEXT: vpsraw $3, %xmm0, %xmm1
-; X64-NEXT: vpsraw $1, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT: vpsraw $4, %xmm0, %xmm0
; X64-NEXT: retq
%x = ashr <8 x i16> %a0, <i16 3, i16 1, i16 3, i16 1, i16 3, i16 1, i16 3, i16 1>
%y = freeze <8 x i16> %x
@@ -592,8 +573,7 @@ define i32 @freeze_lshr(i32 %a0) nounwind {
; X64-LABEL: freeze_lshr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $2, %eax
-; X64-NEXT: shrl %eax
+; X64-NEXT: shrl $3, %eax
; X64-NEXT: retq
%x = lshr i32 %a0, 2
%y = freeze i32 %x
@@ -664,30 +644,12 @@ define i32 @freeze_lshr_outofrange(i32 %a0) nounwind {
define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
; X86-LABEL: freeze_lshr_vec:
; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psrlw $1, %xmm2
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: pandn %xmm2, %xmm3
-; X86-NEXT: psrlw $2, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: por %xmm3, %xmm0
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psrlw $2, %xmm2
-; X86-NEXT: psrlw $1, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: pandn %xmm2, %xmm1
-; X86-NEXT: por %xmm1, %xmm0
+; X86-NEXT: psrlw $3, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_lshr_vec:
; X64: # %bb.0:
-; X64-NEXT: vpsrlw $1, %xmm0, %xmm1
-; X64-NEXT: vpsrlw $2, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-NEXT: vpsrlw $2, %xmm0, %xmm1
-; X64-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT: vpsrlw $3, %xmm0, %xmm0
; X64-NEXT: retq
%x = lshr <8 x i16> %a0, <i16 2, i16 1, i16 2, i16 1, i16 2, i16 1, i16 2, i16 1>
%y = freeze <8 x i16> %x
diff --git a/llvm/test/CodeGen/X86/freeze-combine.ll b/llvm/test/CodeGen/X86/freeze-combine.ll
index b037a6d9a1b93b..1cfb8627a4dd45 100644
--- a/llvm/test/CodeGen/X86/freeze-combine.ll
+++ b/llvm/test/CodeGen/X86/freeze-combine.ll
@@ -3,9 +3,9 @@
define i32 @const() {
; CHECK-LABEL: name: const
; CHECK: bb.0 (%ir-block.0):
- ; CHECK: [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 1
- ; CHECK: $eax = COPY [[MOV32ri]]
- ; CHECK: RET 0, $eax
+ ; CHECK-NEXT: [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 1
+ ; CHECK-NEXT: $eax = COPY [[MOV32ri]]
+ ; CHECK-NEXT: RET 0, $eax
%y = freeze i32 1
ret i32 %y
}
@@ -13,11 +13,11 @@ define i32 @const() {
define i32 @fold(i32 %x) {
; CHECK-LABEL: name: fold
; CHECK: bb.0 (%ir-block.0):
- ; CHECK: liveins: $edi
- ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY $edi
- ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY [[COPY]]
- ; CHECK: $eax = COPY [[COPY1]]
- ; CHECK: RET 0, $eax
+ ; CHECK-NEXT: liveins: $edi
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi
+ ; CHECK-NEXT: $eax = COPY [[COPY]]
+ ; CHECK-NEXT: RET 0, $eax
%y = freeze i32 %x
%z = freeze i32 %y
ret i32 %z
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index ee7f4aea02c00f..fe240286462e9a 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -672,3 +672,23 @@ define void @pr59677(i32 %x, ptr %out) nounwind {
ret void
}
declare <4 x float> @llvm.sin.v4f32(<4 x float>)
+
+; Test that we can eliminate freeze by changing the BUILD_VECTOR to a splat
+; zero vector.
+define void @freeze_buildvector_not_simple_type(ptr %dst) nounwind {
+; X86-LABEL: freeze_buildvector_not_simple_type:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb $0, 4(%eax)
+; X86-NEXT: movl $0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: freeze_buildvector_not_simple_type:
+; X64: # %bb.0:
+; X64-NEXT: movb $0, 4(%rdi)
+; X64-NEXT: movl $0, (%rdi)
+; X64-NEXT: retq
+ %i0 = freeze <5 x i8> <i8 poison, i8 0, i8 0, i8 undef, i8 0>
+ store <5 x i8> %i0, ptr %dst
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index 0c341dc63a9ecc..afe0ebb9dcb4f0 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -522,17 +522,17 @@ declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
define <16 x i8> @splatconstant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; GFNISSE-LABEL: splatconstant_fshr_v16i8:
; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
; GFNISSE-NEXT: psrlw $7, %xmm1
; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
; GFNISSE-NEXT: por %xmm1, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1OR2-LABEL: splatconstant_fshr_v16i8:
; GFNIAVX1OR2: # %bb.0:
+; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm1, %xmm1
; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index 7ab8300b269a48..96aff5b2af3155 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -421,18 +421,18 @@ define <16 x i8> @splatconstant_rotr_v16i8(<16 x i8> %a) nounwind {
; GFNISSE-LABEL: splatconstant_rotr_v16i8:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa %xmm0, %xmm1
-; GFNISSE-NEXT: psrlw $7, %xmm1
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: paddb %xmm0, %xmm1
+; GFNISSE-NEXT: psrlw $7, %xmm0
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT: por %xmm1, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1OR2-LABEL: splatconstant_rotr_v16i8:
; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm0, %xmm1
-; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vpor %xmm0, %xmm1, %xmm0
; GFNIAVX1OR2-NEXT: retq
;
; GFNIAVX512-LABEL: splatconstant_rotr_v16i8:
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index 39d02f9112f4fc..2f780e3c6fe1f1 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -676,12 +676,13 @@ define i32 @rotr_known_nonzero(i32 %xx, i32 %y) {
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: orl $256, %edi # imm = 0x100
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: rorl %cl, %edi
+; X64-NEXT: rorl %cl, %eax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB22_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB22_1:
; X64-NEXT: movl $32, %eax
@@ -713,12 +714,13 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotr_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: rorl %cl, %edi
+; X64-NEXT: rorl %cl, %eax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB23_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB23_1:
; X64-NEXT: movl $32, %eax
@@ -773,12 +775,13 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotr_with_fshr_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: rorl %cl, %edi
+; X64-NEXT: rorl %cl, %eax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB25_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB25_1:
; X64-NEXT: movl $32, %eax
@@ -808,12 +811,13 @@ define i32 @rotl_known_nonzero(i32 %xx, i32 %y) {
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: orl $256, %edi # imm = 0x100
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: roll %cl, %edi
+; X64-NEXT: roll %cl, %eax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB26_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB26_1:
; X64-NEXT: movl $32, %eax
@@ -845,12 +849,13 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotl_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: roll %cl, %edi
+; X64-NEXT: roll %cl, %eax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB27_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB27_1:
; X64-NEXT: movl $32, %eax
@@ -905,12 +910,13 @@ define i32 @rotl_with_fshl_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotl_with_fshl_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: roll %cl, %edi
+; X64-NEXT: roll %cl, %eax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB29_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB29_1:
; X64-NEXT: movl $32, %eax
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index d3cced3233ea65..5a6375e08bcaff 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -893,27 +893,26 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm6
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psubq %xmm1, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1]
+; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psubq %xmm1, %xmm4
; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm1
-; SSE41-NEXT: pmuludq %xmm6, %xmm1
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: psrlq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm0, %xmm3
-; SSE41-NEXT: paddq %xmm1, %xmm3
-; SSE41-NEXT: psllq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm6, %xmm0
+; SSE41-NEXT: pmuludq %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psrlq $32, %xmm4
+; SSE41-NEXT: pmuludq %xmm0, %xmm4
+; SSE41-NEXT: paddq %xmm1, %xmm4
+; SSE41-NEXT: psllq $32, %xmm4
+; SSE41-NEXT: pmuludq %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
-; SSE41-NEXT: paddq %xmm3, %xmm0
+; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: vec128_i64_signed_reg_reg:
@@ -1077,27 +1076,26 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm6
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psubq %xmm1, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1]
+; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psubq %xmm1, %xmm4
; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm1
-; SSE41-NEXT: pmuludq %xmm6, %xmm1
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: psrlq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm0, %xmm3
-; SSE41-NEXT: paddq %xmm1, %xmm3
-; SSE41-NEXT: psllq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm6, %xmm0
+; SSE41-NEXT: pmuludq %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psrlq $32, %xmm4
+; SSE41-NEXT: pmuludq %xmm0, %xmm4
+; SSE41-NEXT: paddq %xmm1, %xmm4
+; SSE41-NEXT: psllq $32, %xmm4
+; SSE41-NEXT: pmuludq %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
-; SSE41-NEXT: paddq %xmm3, %xmm0
+; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1993,14 +1991,14 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
@@ -2786,14 +2784,14 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
;
; AVX512VL-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index cc08396ae8c78f..e880a1acc9e83f 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -1445,14 +1445,14 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
;
; AVX512VL-FALLBACK-LABEL: vec256_i16_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
@@ -2210,14 +2210,14 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
;
; AVX512VL-FALLBACK-LABEL: vec256_i8_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index 2fdf6ef224ca96..366dad1612b437 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -684,22 +684,21 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -715,22 +714,21 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -772,20 +770,19 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6
; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm4, %zmm4
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -803,20 +800,19 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm4, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 04aff9b7d2e586..ace78b38d53edb 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -22,7 +22,7 @@ define void @f() nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $176, %esp
+; X86-NEXT: subl $160, %esp
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -47,55 +47,54 @@ define void @f() nounwind {
; X86-NEXT: testl %edi, %edi
; X86-NEXT: jne .LBB0_1
; X86-NEXT: # %bb.2: # %BB_udiv-special-cases
-; X86-NEXT: bsrl %esi, %ecx
-; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: addl $32, %ecx
+; X86-NEXT: bsrl %esi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: addl $32, %eax
; X86-NEXT: jmp .LBB0_3
; X86-NEXT: .LBB0_1:
-; X86-NEXT: bsrl %edi, %ecx
-; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: xorl $31, %eax
; X86-NEXT: .LBB0_3: # %BB_udiv-special-cases
-; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: testl %edx, %edx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: jne .LBB0_4
; X86-NEXT: # %bb.5: # %BB_udiv-special-cases
-; X86-NEXT: addl $64, %ecx
+; X86-NEXT: addl $64, %eax
; X86-NEXT: jmp .LBB0_6
; X86-NEXT: .LBB0_4:
-; X86-NEXT: bsrl %edx, %ecx
-; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: addl $32, %ecx
+; X86-NEXT: bsrl %edx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: addl $32, %eax
; X86-NEXT: .LBB0_6: # %BB_udiv-special-cases
-; X86-NEXT: subl $62, %ecx
+; X86-NEXT: subl $62, %eax
; X86-NEXT: movl $0, %ebx
; X86-NEXT: sbbl %ebx, %ebx
-; X86-NEXT: sbbl %eax, %eax
-; X86-NEXT: addl $-66, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: addl $-66, %eax
; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: adcl $3, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movb $1, %al
-; X86-NEXT: testb %al, %al
+; X86-NEXT: adcl $3, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: testb %cl, %cl
; X86-NEXT: jne .LBB0_11
; X86-NEXT: # %bb.7: # %BB_udiv-special-cases
-; X86-NEXT: andl $3, %edi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: xorl $65, %eax
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: andl $3, %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: xorl $65, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: je .LBB0_11
; X86-NEXT: # %bb.8: # %udiv-bb1
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl $1, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: andl $3, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: andl $3, %ebx
; X86-NEXT: movb $65, %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, %ch
@@ -112,29 +111,31 @@ define void @f() nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 136(%esp,%eax), %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 120(%esp,%eax), %edi
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shll %cl, %edx
+; X86-NEXT: shll %cl, %edi
; X86-NEXT: notb %cl
-; X86-NEXT: movl 128(%esp,%eax), %edi
-; X86-NEXT: movl 132(%esp,%eax), %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl 112(%esp,%eax), %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 116(%esp,%eax), %edx
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: shrl %eax
; X86-NEXT: shrl %cl, %eax
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: je .LBB0_11
; X86-NEXT: # %bb.9: # %udiv-preheader
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: andl $3, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: andl $3, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -149,20 +150,20 @@ define void @f() nounwind {
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $15, %al
-; X86-NEXT: movzbl %al, %esi
-; X86-NEXT: movl 80(%esp,%esi), %edx
-; X86-NEXT: movl 84(%esp,%esi), %eax
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 64(%esp,%eax), %edi
+; X86-NEXT: movl 68(%esp,%eax), %edx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, %edi
+; X86-NEXT: shrl %cl, %esi
; X86-NEXT: notb %cl
-; X86-NEXT: movl 88(%esp,%esi), %esi
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl 72(%esp,%eax), %ebx
+; X86-NEXT: addl %ebx, %ebx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %esi, %ebx
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
@@ -179,63 +180,62 @@ define void @f() nounwind {
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB0_10: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: shldl $1, %ebx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: andl $2, %eax
-; X86-NEXT: shrl %eax
-; X86-NEXT: leal (%eax,%edx,2), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: shldl $1, %ebx, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: andl $2, %edx
+; X86-NEXT: shrl %edx
+; X86-NEXT: leal (%edx,%ebx,2), %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %eax, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $3, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: sbbl %ecx, %ebx
-; X86-NEXT: shll $30, %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: sarl $30, %eax
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: shrdl $1, %ebx, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: andl $1, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $3, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %esi
+; X86-NEXT: shll $30, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: sarl $30, %edx
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: shrdl $1, %esi, %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ebx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: andl $3, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
-; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $3, %edi
-; X86-NEXT: andl $3, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $3, %esi
+; X86-NEXT: andl $3, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: jne .LBB0_10
; X86-NEXT: .LBB0_11: # %udiv-end
; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index 782c84408f25ab..1b13cee628df67 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -8,21 +8,20 @@ define i64 @PR62286(i32 %a) {
; SSE-LABEL: PR62286:
; SSE: # %bb.0:
; SSE-NEXT: movd %edi, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,0]
-; SSE-NEXT: paddd %xmm1, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,1,0]
+; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: paddq %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: paddq %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: paddq %xmm1, %xmm0
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: PR62286:
@@ -47,10 +46,10 @@ define i64 @PR62286(i32 %a) {
; AVX2-LABEL: PR62286:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1
+; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index a9f3e8b22fb69e..785b97d8c24027 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -14,7 +14,6 @@ define i256 @test1(i256 %a) nounwind {
; ILP: # %bb.0:
; ILP-NEXT: movq %rdi, %rax
; ILP-NEXT: leal (%rsi,%rsi), %ecx
-; ILP-NEXT: addb $3, %cl
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
@@ -23,6 +22,7 @@ define i256 @test1(i256 %a) nounwind {
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT: addb $3, %cl
; ILP-NEXT: movl %ecx, %edx
; ILP-NEXT: shrb $3, %dl
; ILP-NEXT: andb $7, %cl
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index 31297a06f80993..a1cabb433d879b 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -563,18 +563,20 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: subq $120, %rsp
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
-; X64-NEXT: psllq $32, %xmm3
+; X64-NEXT: pxor %xmm3, %xmm3
+; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; X64-NEXT: psrad $31, %xmm2
; X64-NEXT: psrlq $31, %xmm3
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %xmm0, %rbp
+; X64-NEXT: movq %rbp, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shldq $31, %rbp, %r14
+; X64-NEXT: movq %rbp, %r15
+; X64-NEXT: shlq $31, %r15
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtd %xmm1, %xmm0
@@ -582,113 +584,113 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movq %xmm1, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: shlq $31, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __divti3 at PLT
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %rbp
-; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %r12
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __modti3 at PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
+; X64-NEXT: shrq $63, %rbp
+; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rdx, %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmpq %rdx, %r13
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: sbbq $0, %rax
-; X64-NEXT: cmovgeq %rcx, %r14
-; X64-NEXT: cmovgeq %rdx, %rbp
+; X64-NEXT: cmovgeq %rdx, %r13
+; X64-NEXT: cmovgeq %rcx, %r12
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rbp, %rcx
+; X64-NEXT: cmpq %r13, %rcx
; X64-NEXT: movq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r14, %rax
-; X64-NEXT: cmovgeq %rcx, %rbp
-; X64-NEXT: movq %rbp, %xmm0
+; X64-NEXT: sbbq %r12, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
+; X64-NEXT: movq %r13, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
-; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %xmm0, %rbp
+; X64-NEXT: movq %rbp, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shldq $31, %rbp, %r14
+; X64-NEXT: movq %rbp, %r15
+; X64-NEXT: shlq $31, %r15
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: shlq $31, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __divti3 at PLT
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %rbp
-; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %r12
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __modti3 at PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
+; X64-NEXT: shrq $63, %rbp
+; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: sbbq $0, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r14
-; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: cmovgeq %rax, %r12
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rbp, %rcx
+; X64-NEXT: cmpq %r13, %rcx
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r14, %rax
-; X64-NEXT: cmovgeq %rcx, %rbp
-; X64-NEXT: movq %rbp, %xmm0
+; X64-NEXT: sbbq %r12, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
+; X64-NEXT: movq %r13, %xmm0
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: psrlq $1, %xmm1
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; X64-NEXT: # xmm0 = mem[0,1,1,3]
-; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: pxor %xmm0, %xmm0
+; X64-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; X64-NEXT: psrad $31, %xmm1
; X64-NEXT: psrlq $31, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %xmm0, %rbp
+; X64-NEXT: movq %rbp, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shldq $31, %rbp, %r14
+; X64-NEXT: movq %rbp, %r15
+; X64-NEXT: shlq $31, %r15
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: pcmpgtd %xmm0, %xmm1
@@ -696,94 +698,92 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: shlq $31, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __divti3 at PLT
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %rbp
-; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %r12
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __modti3 at PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
+; X64-NEXT: shrq $63, %rbp
+; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: sbbq $0, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r14
-; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: cmovgeq %rax, %r12
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rbp, %rcx
+; X64-NEXT: cmpq %r13, %rcx
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r14, %rax
-; X64-NEXT: cmovgeq %rcx, %rbp
-; X64-NEXT: movq %rbp, %xmm0
+; X64-NEXT: sbbq %r12, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
+; X64-NEXT: movq %r13, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
-; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %xmm0, %rbp
+; X64-NEXT: movq %rbp, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shldq $31, %rbp, %r14
+; X64-NEXT: movq %rbp, %r15
+; X64-NEXT: shlq $31, %r15
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: shlq $31, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __divti3 at PLT
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %rbp
-; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %r12
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __modti3 at PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
+; X64-NEXT: shrq $63, %rbp
+; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: sbbq $0, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r14
-; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: cmovgeq %rax, %r12
; X64-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rbp, %rax
-; X64-NEXT: sbbq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: cmovgeq %rax, %rbp
-; X64-NEXT: movq %rbp, %xmm1
+; X64-NEXT: cmpq %r13, %rax
+; X64-NEXT: sbbq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT: cmovgeq %rax, %r13
+; X64-NEXT: movq %r13, %xmm1
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: psrlq $1, %xmm0
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index 97c3c2040b2914..a80d8d8cd01b85 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -46,7 +46,6 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-NEXT: movq 24(%rsi), %rcx
; CHECK-NEXT: movq 32(%rsi), %rdx
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
-; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2,2]
; CHECK-NEXT: .p2align 4, 0x90
@@ -54,39 +53,45 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB0_2 Depth 2
; CHECK-NEXT: xorpd %xmm3, %xmm3
-; CHECK-NEXT: movq $-1024, %rdi # imm = 0xFC00
+; CHECK-NEXT: movq $-1024, %rsi # imm = 0xFC00
; CHECK-NEXT: movdqa %xmm0, %xmm4
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %vector.body
; CHECK-NEXT: # Parent Loop BB0_1 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: cmpq 1024(%rdx,%rdi), %rsi
-; CHECK-NEXT: movq %rcx, %r8
-; CHECK-NEXT: sbbq 1032(%rdx,%rdi), %r8
-; CHECK-NEXT: setge %r8b
-; CHECK-NEXT: movzbl %r8b, %r8d
-; CHECK-NEXT: andl $1, %r8d
+; CHECK-NEXT: movdqu 1024(%rdx,%rsi), %xmm5
+; CHECK-NEXT: movdqu 1040(%rdx,%rsi), %xmm6
+; CHECK-NEXT: movq %xmm5, %rdi
+; CHECK-NEXT: movq %xmm6, %r8
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; CHECK-NEXT: movq %xmm5, %r9
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
+; CHECK-NEXT: movq %xmm5, %r10
; CHECK-NEXT: negq %r8
-; CHECK-NEXT: movq %r8, %xmm5
-; CHECK-NEXT: cmpq 1040(%rdx,%rdi), %rsi
; CHECK-NEXT: movq %rcx, %r8
-; CHECK-NEXT: sbbq 1048(%rdx,%rdi), %r8
+; CHECK-NEXT: sbbq %r10, %r8
; CHECK-NEXT: setge %r8b
; CHECK-NEXT: movzbl %r8b, %r8d
-; CHECK-NEXT: andl $1, %r8d
; CHECK-NEXT: negq %r8
-; CHECK-NEXT: movq %r8, %xmm6
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; CHECK-NEXT: movdqa %xmm1, %xmm6
-; CHECK-NEXT: psllq %xmm4, %xmm6
+; CHECK-NEXT: movq %r8, %xmm5
+; CHECK-NEXT: negq %rdi
+; CHECK-NEXT: movq %rcx, %rdi
+; CHECK-NEXT: sbbq %r9, %rdi
+; CHECK-NEXT: setge %dil
+; CHECK-NEXT: movzbl %dil, %edi
+; CHECK-NEXT: negq %rdi
+; CHECK-NEXT: movq %rdi, %xmm6
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; CHECK-NEXT: movdqa %xmm1, %xmm5
+; CHECK-NEXT: psllq %xmm4, %xmm5
; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
; CHECK-NEXT: movdqa %xmm1, %xmm8
; CHECK-NEXT: psllq %xmm7, %xmm8
-; CHECK-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1]
-; CHECK-NEXT: andpd %xmm5, %xmm8
+; CHECK-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1]
+; CHECK-NEXT: andpd %xmm6, %xmm8
; CHECK-NEXT: orpd %xmm8, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm4
-; CHECK-NEXT: addq $32, %rdi
+; CHECK-NEXT: addq $32, %rsi
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # %bb.3: # %middle.block
; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
@@ -101,7 +106,6 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: movq 24(%rsi), %rcx
; CHECK-AVX2-NEXT: movq 32(%rsi), %rdx
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,1]
-; CHECK-AVX2-NEXT: xorl %esi, %esi
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,1]
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [2,2]
; CHECK-AVX2-NEXT: .p2align 4, 0x90
@@ -109,34 +113,40 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: # =>This Loop Header: Depth=1
; CHECK-AVX2-NEXT: # Child Loop BB0_2 Depth 2
; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: movq $-1024, %rdi # imm = 0xFC00
+; CHECK-AVX2-NEXT: movq $-1024, %rsi # imm = 0xFC00
; CHECK-AVX2-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-AVX2-NEXT: .p2align 4, 0x90
; CHECK-AVX2-NEXT: .LBB0_2: # %vector.body
; CHECK-AVX2-NEXT: # Parent Loop BB0_1 Depth=1
; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-AVX2-NEXT: cmpq 1024(%rdx,%rdi), %rsi
-; CHECK-AVX2-NEXT: movq %rcx, %r8
-; CHECK-AVX2-NEXT: sbbq 1032(%rdx,%rdi), %r8
+; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5
+; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6
+; CHECK-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm5[0],xmm6[0]
+; CHECK-AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; CHECK-AVX2-NEXT: vmovq %xmm5, %rdi
+; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %r8
+; CHECK-AVX2-NEXT: vmovq %xmm7, %r9
+; CHECK-AVX2-NEXT: vpextrq $1, %xmm7, %r10
+; CHECK-AVX2-NEXT: negq %r10
+; CHECK-AVX2-NEXT: movq %rcx, %r10
+; CHECK-AVX2-NEXT: sbbq %r8, %r10
; CHECK-AVX2-NEXT: setge %r8b
; CHECK-AVX2-NEXT: movzbl %r8b, %r8d
-; CHECK-AVX2-NEXT: andl $1, %r8d
; CHECK-AVX2-NEXT: negq %r8
; CHECK-AVX2-NEXT: vmovq %r8, %xmm5
-; CHECK-AVX2-NEXT: cmpq 1040(%rdx,%rdi), %rsi
+; CHECK-AVX2-NEXT: negq %r9
; CHECK-AVX2-NEXT: movq %rcx, %r8
-; CHECK-AVX2-NEXT: sbbq 1048(%rdx,%rdi), %r8
-; CHECK-AVX2-NEXT: setge %r8b
-; CHECK-AVX2-NEXT: movzbl %r8b, %r8d
-; CHECK-AVX2-NEXT: andl $1, %r8d
-; CHECK-AVX2-NEXT: negq %r8
-; CHECK-AVX2-NEXT: vmovq %r8, %xmm6
-; CHECK-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; CHECK-AVX2-NEXT: sbbq %rdi, %r8
+; CHECK-AVX2-NEXT: setge %dil
+; CHECK-AVX2-NEXT: movzbl %dil, %edi
+; CHECK-AVX2-NEXT: negq %rdi
+; CHECK-AVX2-NEXT: vmovq %rdi, %xmm6
+; CHECK-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0]
; CHECK-AVX2-NEXT: vpsllvq %xmm4, %xmm1, %xmm6
; CHECK-AVX2-NEXT: vpand %xmm6, %xmm5, %xmm5
; CHECK-AVX2-NEXT: vpor %xmm3, %xmm5, %xmm3
; CHECK-AVX2-NEXT: vpaddq %xmm2, %xmm4, %xmm4
-; CHECK-AVX2-NEXT: addq $32, %rdi
+; CHECK-AVX2-NEXT: addq $32, %rsi
; CHECK-AVX2-NEXT: jne .LBB0_2
; CHECK-AVX2-NEXT: # %bb.3: # %middle.block
; CHECK-AVX2-NEXT: # in Loop: Header=BB0_1 Depth=1
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index cee30f5fe5da9e..460c5fe11f82a5 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1045,16 +1045,12 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: saddo_v4i1:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k2
-; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: kshiftlw $12, %k2, %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index 64ed081048851b..d06993da6365d8 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -1062,16 +1062,12 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: ssubo_v4i1:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
-; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: kshiftlw $12, %k0, %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 950e943bd90201..bac118095331ca 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1098,16 +1098,12 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: uaddo_v4i1:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k2
-; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: kshiftlw $12, %k2, %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index 7de972770d8da4..ab75ada72f2565 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -1145,16 +1145,12 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: usubo_v4i1:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
-; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: kshiftlw $12, %k0, %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll
index 78797b9acc2e6e..27aaad6353ed68 100644
--- a/llvm/test/CodeGen/X86/vector-bo-select.ll
+++ b/llvm/test/CodeGen/X86/vector-bo-select.ll
@@ -3137,11 +3137,11 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef
; AVX512-LABEL: mul_v8i64_cast_cond:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw %edi, %k1
-; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm2
-; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm2
-; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm3
-; AVX512-NEXT: vpmuludq %zmm1, %zmm3, %zmm3
-; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm2
+; AVX512-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm3
+; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm1
; AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm0 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index b839452725a95f..3aaa9268a8d888 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -58,12 +58,12 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE41-NEXT: psrlq %xmm4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pandn %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE41-NEXT: paddq %xmm0, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllq %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: retq
;
@@ -76,11 +76,11 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -158,13 +158,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -366,13 +366,13 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; XOPAVX1-LABEL: var_funnnel_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -646,26 +646,26 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; XOPAVX1-LABEL: var_funnnel_v8i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshlw %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v8i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX2-NEXT: vpsubw %xmm4, %xmm5, %xmm4
-; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
@@ -995,26 +995,26 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; XOPAVX1-LABEL: var_funnnel_v16i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v16i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4
-; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 7b6b0ea83c7eea..fc65f759f5fbed 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -486,22 +486,22 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; XOPAVX2-LABEL: var_funnnel_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; XOPAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; XOPAVX2-NEXT: vpsubw %xmm5, %xmm6, %xmm5
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
-; XOPAVX2-NEXT: vpshlw %xmm5, %xmm7, %xmm5
-; XOPAVX2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
-; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
+; XOPAVX2-NEXT: vpshlw %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT: vpsubw %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; XOPAVX2-NEXT: vpshlw %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm2
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt)
diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
index 0426c48aecfcff..a6067a960fc0d6 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
@@ -185,13 +185,13 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt)
; XOPAVX1-LABEL: var_funnnel_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index c54da38ef10cc1..75baba5f35f792 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -927,9 +927,9 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $7, %xmm1
-; SSE2-NEXT: paddq %xmm0, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: psllq $7, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
@@ -975,9 +975,9 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v2i64:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: psllq $7, %xmm1
-; X86-SSE-NEXT: paddq %xmm0, %xmm0
-; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X86-SSE-NEXT: paddq %xmm0, %xmm1
+; X86-SSE-NEXT: psllq $7, %xmm0
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE-NEXT: retl
%shift = shl <2 x i64> %a, <i64 1, i64 7>
ret <2 x i64> %shift