[llvm] 16bd10a - Revert "[SelectionDAG] Handle more opcodes in canCreateUndefOrPoison (#84921)" and more...
David Spickett via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 29 01:47:55 PDT 2024
Author: David Spickett
Date: 2024-04-29T09:47:41+01:00
New Revision: 16bd10a38730fed27a3bf111076b8ef7a7e7b3ee
URL: https://github.com/llvm/llvm-project/commit/16bd10a38730fed27a3bf111076b8ef7a7e7b3ee
DIFF: https://github.com/llvm/llvm-project/commit/16bd10a38730fed27a3bf111076b8ef7a7e7b3ee.diff
LOG: Revert "[SelectionDAG] Handle more opcodes in canCreateUndefOrPoison (#84921)" and more...
This reverts:
b3c55b707110084a9f50a16aade34c3be6fa18da - "[SelectionDAG] Handle more opcodes in canCreateUndefOrPoison (#84921)"
(because it updates a test case for which I do not know how to resolve the conflict)
8e2f6495c0bac1dd6ee32b6a0d24152c9c343624 - "[DAGCombiner] Do not always fold FREEZE over BUILD_VECTOR (#85932)"
73472c5996716cda0dbb3ddb788304e0e7e6a323 - "[SelectionDAG] Treat CopyFromReg as freezing the value (#85932)"
These are reverted due to a test-suite failure on AArch64 when compiling for SVE:
https://lab.llvm.org/buildbot/#/builders/197/builds/13955
clang: ../llvm/llvm/include/llvm/CodeGen/ValueTypes.h:307: MVT llvm::EVT::getSimpleVT() const: Assertion `isSimple() && "Expected a SimpleValueType!"' failed.
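The assertion quoted above comes from EVT::getSimpleVT(), which requires the value type to be one of the predefined simple MVTs. The sketch below is illustrative only and is not taken from the reverted commits; it shows the guarded pattern the assertion enforces, and the fallback helper is hypothetical. It does not identify the exact call site that trips the assertion during the SVE build.

// Minimal illustrative sketch (not from the reverted commits): the precondition
// that the failing assertion enforces. EVT::getSimpleVT() may only be called
// when isSimple() is true; extended value types must be handled separately.
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

static MVT getSimpleVTOrOther(EVT VT) {
  if (VT.isSimple())
    return VT.getSimpleVT(); // safe: VT maps to a predefined MVT
  return MVT::Other;         // hypothetical fallback for extended types
}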
Added:
Modified:
llvm/include/llvm/CodeGen/ISDOpcodes.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/test/CodeGen/AArch64/combine-mul.ll
llvm/test/CodeGen/AMDGPU/div_i128.ll
llvm/test/CodeGen/AMDGPU/rem_i128.ll
llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
llvm/test/CodeGen/RISCV/alu64.ll
llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
llvm/test/CodeGen/RISCV/bfloat-convert.ll
llvm/test/CodeGen/RISCV/double-convert.ll
llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
llvm/test/CodeGen/RISCV/float-convert.ll
llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
llvm/test/CodeGen/RISCV/forced-atomics.ll
llvm/test/CodeGen/RISCV/fpclamptosat.ll
llvm/test/CodeGen/RISCV/half-convert.ll
llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
llvm/test/CodeGen/RISCV/iabs.ll
llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
llvm/test/CodeGen/X86/abdu-vector-128.ll
llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
llvm/test/CodeGen/X86/combine-mul.ll
llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
llvm/test/CodeGen/X86/fold-masked-merge.ll
llvm/test/CodeGen/X86/freeze-binary.ll
llvm/test/CodeGen/X86/freeze-combine.ll
llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
llvm/test/CodeGen/X86/gfni-rotates.ll
llvm/test/CodeGen/X86/known-never-zero.ll
llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
llvm/test/CodeGen/X86/pr38539.ll
llvm/test/CodeGen/X86/pr62286.ll
llvm/test/CodeGen/X86/scheduler-backtracking.ll
llvm/test/CodeGen/X86/sdiv_fix_sat.ll
llvm/test/CodeGen/X86/setcc-non-simple-type.ll
llvm/test/CodeGen/X86/vec_saddo.ll
llvm/test/CodeGen/X86/vec_ssubo.ll
llvm/test/CodeGen/X86/vec_uaddo.ll
llvm/test/CodeGen/X86/vec_usubo.ll
llvm/test/CodeGen/X86/vector-bo-select.ll
llvm/test/CodeGen/X86/vector-fshr-128.ll
llvm/test/CodeGen/X86/vector-fshr-256.ll
llvm/test/CodeGen/X86/vector-fshr-sub128.ll
llvm/test/CodeGen/X86/vector-shift-shl-128.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 6429947958ee91d..078a936b061a329 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -205,7 +205,6 @@ enum NodeType {
/// CopyFromReg - This node indicates that the input value is a virtual or
/// physical register that is defined outside of the scope of this
/// SelectionDAG. The register is available from the RegisterSDNode object.
- /// Note that CopyFromReg is considered as also freezing the value.
CopyFromReg,
/// UNDEF - An undefined node.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 326a004d06f20a7..f8949b926922e42 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15459,12 +15459,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
return N0;
- // We currently avoid folding freeze over SRA/SRL, due to the problems seen
- // with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for
- // example https://reviews.llvm.org/D136529#4120959.
- if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
- return SDValue();
-
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
// Try to push freeze through instructions that propagate but don't produce
// poison as far as possible. If an operand of freeze follows three
@@ -15481,26 +15475,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
N0.getOpcode() == ISD::BUILD_PAIR ||
N0.getOpcode() == ISD::CONCAT_VECTORS;
- // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
- // ones" or "constant" into something that depends on FrozenUndef. We can
- // instead pick undef values to keep those properties, while at the same time
- // folding away the freeze.
- // If we implement a more general solution for folding away freeze(undef) in
- // the future, then this special handling can be removed.
- if (N0.getOpcode() == ISD::BUILD_VECTOR) {
- SDLoc DL(N0);
- MVT VT = N0.getSimpleValueType();
- if (llvm::ISD::isBuildVectorAllOnes(N0.getNode()))
- return DAG.getAllOnesConstant(DL, VT);
- if (llvm::ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
- SmallVector<SDValue, 8> NewVecC;
- for (const SDValue &Op : N0->op_values())
- NewVecC.push_back(
- Op.isUndef() ? DAG.getConstant(0, DL, Op.getValueType()) : Op);
- return DAG.getBuildVector(VT, DL, NewVecC);
- }
- }
-
SmallSetVector<SDValue, 8> MaybePoisonOperands;
for (SDValue Op : N0->ops()) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index dfbfaa8c894f55f..224c0c5ee9706cb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5063,7 +5063,6 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
case ISD::VALUETYPE:
case ISD::FrameIndex:
case ISD::TargetFrameIndex:
- case ISD::CopyFromReg:
return true;
case ISD::UNDEF:
@@ -5137,16 +5136,6 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::FREEZE:
case ISD::CONCAT_VECTORS:
case ISD::INSERT_SUBVECTOR:
- case ISD::SADDSAT:
- case ISD::UADDSAT:
- case ISD::SSUBSAT:
- case ISD::USUBSAT:
- case ISD::MULHU:
- case ISD::MULHS:
- case ISD::SMIN:
- case ISD::SMAX:
- case ISD::UMIN:
- case ISD::UMAX:
case ISD::AND:
case ISD::XOR:
case ISD::ROTL:
@@ -5167,7 +5156,6 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::BUILD_PAIR:
return false;
- case ISD::SELECT_CC:
case ISD::SETCC: {
// Integer setcc cannot create undef or poison.
if (Op.getOperand(0).getValueType().isInteger())
@@ -5177,8 +5165,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
// based on options and flags. The options and flags also cause special
// nonan condition codes to be used. Those condition codes may be preserved
// even if the nonan flag is dropped somewhere.
- unsigned CCOp = Opcode == ISD::SETCC ? 2 : 4;
- ISD::CondCode CCCode = cast<CondCodeSDNode>(Op.getOperand(CCOp))->get();
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(Op.getOperand(2))->get();
if (((unsigned)CCCode & 0x10U))
return true;
@@ -5195,8 +5182,6 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
return false;
case ISD::SHL:
- case ISD::SRL:
- case ISD::SRA:
// If the max shift amount isn't in range, then the shift can create poison.
return !getValidMaximumShiftAmountConstant(Op, DemandedElts);
diff --git a/llvm/test/CodeGen/AArch64/combine-mul.ll b/llvm/test/CodeGen/AArch64/combine-mul.ll
index c49e5ae6620a9ee..a2b0425308093d7 100644
--- a/llvm/test/CodeGen/AArch64/combine-mul.ll
+++ b/llvm/test/CodeGen/AArch64/combine-mul.ll
@@ -44,7 +44,8 @@ define <4 x i1> @PR48683_vec_undef(<4 x i32> %x) {
define i64 @combine_mul_self_demandedbits(i64 %x) {
; CHECK-LABEL: combine_mul_self_demandedbits:
; CHECK: // %bb.0:
-; CHECK-NEXT: mul x0, x0, x0
+; CHECK-NEXT: mul x8, x0, x0
+; CHECK-NEXT: and x0, x8, #0xfffffffffffffffd
; CHECK-NEXT: ret
%1 = mul i64 %x, %x
%2 = and i64 %1, -3
@@ -76,7 +77,7 @@ define i8 @one_demanded_bit(i8 %x) {
define <2 x i64> @one_demanded_bit_splat(<2 x i64> %x) {
; CHECK-LABEL: one_demanded_bit_splat:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32 // =0x20
+; CHECK-NEXT: mov w8, #32
; CHECK-NEXT: shl v0.2d, v0.2d, #5
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -130,7 +131,7 @@ define i32 @squared_demanded_2_low_bits(i32 %x) {
define <2 x i64> @squared_demanded_2_low_bits_splat(<2 x i64> %x) {
; CHECK-LABEL: squared_demanded_2_low_bits_splat:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-2 // =0xfffffffffffffffe
+; CHECK-NEXT: mov x8, #-2
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index b2f9bf89d9ec604..cf99b5d80e13a87 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -282,21 +282,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v4
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1
@@ -312,21 +312,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12
; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v5, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v3, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v2, v5, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v8, v3, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7
; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, v6
; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v4
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
@@ -339,26 +339,18 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7
@@ -411,8 +403,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
@@ -448,8 +439,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
@@ -700,10 +690,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
@@ -913,14 +903,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
@@ -1038,10 +1028,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index b068d87c4d6f48e..6ba66ccf71868e9 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -242,137 +242,130 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0: ; %bb.0: ; %_udiv-special-cases
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: s_mov_b32 s4, 63
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[10:11]
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
+; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[6:7]
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6
-; GFX9-O0-NEXT: v_ashrrev_i64 v[15:16], s4, v[13:14]
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
-; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11
-; GFX9-O0-NEXT: v_xor_b32_e64 v13, v8, v12
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
+; GFX9-O0-NEXT: v_ashrrev_i64 v[6:7], s4, v[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
+; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
+; GFX9-O0-NEXT: v_xor_b32_e64 v13, v11, v12
; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v12
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16
-; GFX9-O0-NEXT: v_xor_b32_e64 v9, v8, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15
-; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, v6
-; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-O0-NEXT: v_xor_b32_e64 v9, v9, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, v6
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v12
+; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
+; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_xor_b32_e64 v7, v5, v6
+; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT: v_xor_b32_e64 v2, v2, v6
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v12
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v11, v12, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v11, v12, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v10, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v3, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v5, v6, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v2, v3, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -445,8 +438,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
@@ -482,8 +474,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
@@ -598,27 +589,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: ; %bb.2: ; %Flow
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(6)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_5
; GFX9-O0-NEXT: .LBB0_3: ; %Flow2
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -633,22 +624,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_9
; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 1
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1]
@@ -688,27 +679,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6
; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_4
; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while
; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -718,30 +709,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8
; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 63
; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3]
@@ -881,24 +872,24 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4
; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5
@@ -908,42 +899,42 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6
; GFX9-O0-NEXT: s_branch .LBB0_1
; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
@@ -1027,12 +1018,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8
@@ -1043,30 +1034,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_6
; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -1108,14 +1099,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4
; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12]
@@ -1161,12 +1152,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10
; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4
@@ -1181,18 +1172,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
@@ -1212,18 +1203,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 32
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[5:6]
@@ -1495,11 +1486,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
; GFX9-O0-NEXT: ; kill: killed $vgpr4
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
index 25106b456d2f7a6..6629d34405492cc 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
@@ -123,9 +123,10 @@ define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a3, 4, 0
-; CHECK-NEXT: st.b $a2, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
+; CHECK-NEXT: addi.d $a3, $sp, 0
+; CHECK-NEXT: bstrins.d $a3, $a0, 4, 0
+; CHECK-NEXT: st.b $a2, $a3, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -149,9 +150,10 @@ define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a3, 4, 1
-; CHECK-NEXT: st.h $a2, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
+; CHECK-NEXT: addi.d $a3, $sp, 0
+; CHECK-NEXT: bstrins.d $a3, $a0, 4, 1
+; CHECK-NEXT: st.h $a2, $a3, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -175,9 +177,10 @@ define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a3, 4, 2
-; CHECK-NEXT: st.w $a2, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
+; CHECK-NEXT: addi.d $a3, $sp, 0
+; CHECK-NEXT: bstrins.d $a3, $a0, 4, 2
+; CHECK-NEXT: st.w $a2, $a3, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -201,9 +204,10 @@ define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a3, 4, 3
-; CHECK-NEXT: st.d $a2, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
+; CHECK-NEXT: addi.d $a3, $sp, 0
+; CHECK-NEXT: bstrins.d $a3, $a0, 4, 3
+; CHECK-NEXT: st.d $a2, $a3, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -227,9 +231,10 @@ define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwin
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr1, $a0, 0
; CHECK-NEXT: xvst $xr1, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2
-; CHECK-NEXT: fst.s $fa0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 2
+; CHECK-NEXT: fst.s $fa0, $a2, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -253,9 +258,10 @@ define void @insert_4xdouble_idx(ptr %src, ptr %dst, double %in, i32 %idx) nounw
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr1, $a0, 0
; CHECK-NEXT: xvst $xr1, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3
-; CHECK-NEXT: fst.d $fa0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 3
+; CHECK-NEXT: fst.d $fa0, $a2, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
index 7f232073ae129c4..19171b7d8ed7845 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
@@ -87,9 +87,10 @@ define void @insert_16xi8_idx(ptr %src, ptr %dst, i8 %ins, i32 %idx) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vst $vr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a3, 3, 0
-; CHECK-NEXT: st.b $a2, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
+; CHECK-NEXT: addi.d $a3, $sp, 0
+; CHECK-NEXT: bstrins.d $a3, $a0, 3, 0
+; CHECK-NEXT: st.b $a2, $a3, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -106,9 +107,10 @@ define void @insert_8xi16_idx(ptr %src, ptr %dst, i16 %ins, i32 %idx) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vst $vr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a3, 3, 1
-; CHECK-NEXT: st.h $a2, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
+; CHECK-NEXT: addi.d $a3, $sp, 0
+; CHECK-NEXT: bstrins.d $a3, $a0, 3, 1
+; CHECK-NEXT: st.h $a2, $a3, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -125,9 +127,10 @@ define void @insert_4xi32_idx(ptr %src, ptr %dst, i32 %ins, i32 %idx) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vst $vr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a3, 3, 2
-; CHECK-NEXT: st.w $a2, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
+; CHECK-NEXT: addi.d $a3, $sp, 0
+; CHECK-NEXT: bstrins.d $a3, $a0, 3, 2
+; CHECK-NEXT: st.w $a2, $a3, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -144,9 +147,10 @@ define void @insert_2xi64_idx(ptr %src, ptr %dst, i64 %ins, i32 %idx) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vst $vr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a3, 3, 3
-; CHECK-NEXT: st.d $a2, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
+; CHECK-NEXT: addi.d $a3, $sp, 0
+; CHECK-NEXT: bstrins.d $a3, $a0, 3, 3
+; CHECK-NEXT: st.d $a2, $a3, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -163,9 +167,10 @@ define void @insert_4xfloat_idx(ptr %src, ptr %dst, float %ins, i32 %idx) nounwi
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr1, $a0, 0
; CHECK-NEXT: vst $vr1, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 3, 2
-; CHECK-NEXT: fst.s $fa0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 3, 2
+; CHECK-NEXT: fst.s $fa0, $a2, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -182,9 +187,10 @@ define void @insert_2xdouble_idx(ptr %src, ptr %dst, double %ins, i32 %idx) noun
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr1, $a0, 0
; CHECK-NEXT: vst $vr1, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 3, 3
-; CHECK-NEXT: fst.d $fa0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 3, 3
+; CHECK-NEXT: fst.d $fa0, $a2, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll
index f032756e007b684..d2ee80e6aa9513d 100644
--- a/llvm/test/CodeGen/RISCV/alu64.ll
+++ b/llvm/test/CodeGen/RISCV/alu64.ll
@@ -57,8 +57,8 @@ define i64 @sltiu(i64 %a) nounwind {
;
; RV32I-LABEL: sltiu:
; RV32I: # %bb.0:
-; RV32I-NEXT: sltiu a0, a0, 3
; RV32I-NEXT: seqz a1, a1
+; RV32I-NEXT: sltiu a0, a0, 3
; RV32I-NEXT: and a0, a1, a0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index a5a2ae79966c3f4..f96e1bad2e3895f 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -372,10 +372,10 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
; RV32IA-NEXT: # =>This Loop Header: Depth=1
; RV32IA-NEXT: # Child Loop BB2_3 Depth 2
; RV32IA-NEXT: mv a3, a2
-; RV32IA-NEXT: addi a2, a2, 1
-; RV32IA-NEXT: sltu a4, a3, a1
-; RV32IA-NEXT: neg a4, a4
-; RV32IA-NEXT: and a4, a4, a2
+; RV32IA-NEXT: addi a4, a2, 1
+; RV32IA-NEXT: sltu a2, a2, a1
+; RV32IA-NEXT: neg a2, a2
+; RV32IA-NEXT: and a4, a2, a4
; RV32IA-NEXT: .LBB2_3: # %atomicrmw.start
; RV32IA-NEXT: # Parent Loop BB2_1 Depth=1
; RV32IA-NEXT: # => This Inner Loop Header: Depth=2
@@ -607,10 +607,10 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
; RV64IA-NEXT: # =>This Loop Header: Depth=1
; RV64IA-NEXT: # Child Loop BB3_3 Depth 2
; RV64IA-NEXT: mv a3, a2
-; RV64IA-NEXT: addi a2, a2, 1
-; RV64IA-NEXT: sltu a4, a3, a1
-; RV64IA-NEXT: neg a4, a4
-; RV64IA-NEXT: and a4, a4, a2
+; RV64IA-NEXT: addi a4, a2, 1
+; RV64IA-NEXT: sltu a2, a2, a1
+; RV64IA-NEXT: neg a2, a2
+; RV64IA-NEXT: and a4, a2, a4
; RV64IA-NEXT: .LBB3_3: # %atomicrmw.start
; RV64IA-NEXT: # Parent Loop BB3_1 Depth=1
; RV64IA-NEXT: # => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
index 770dcccee882bec..9e2b0b5c3cbb411 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
@@ -456,80 +456,92 @@ define i64 @fcvt_l_bf16(bfloat %a) nounwind {
define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
; RV32IZFBFMIN-LABEL: fcvt_l_bf16_sat:
; RV32IZFBFMIN: # %bb.0: # %start
-; RV32IZFBFMIN-NEXT: addi sp, sp, -16
-; RV32IZFBFMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT: addi sp, sp, -32
+; RV32IZFBFMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT: fsw fs0, 8(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT: lui a0, %hi(.LCPI10_0)
+; RV32IZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fs0, fa0
+; RV32IZFBFMIN-NEXT: flt.s s0, fa5, fs0
+; RV32IZFBFMIN-NEXT: neg s1, s0
; RV32IZFBFMIN-NEXT: lui a0, 913408
; RV32IZFBFMIN-NEXT: fmv.w.x fa5, a0
-; RV32IZFBFMIN-NEXT: fle.s s0, fa5, fs0
+; RV32IZFBFMIN-NEXT: fle.s s2, fa5, fs0
+; RV32IZFBFMIN-NEXT: neg s3, s2
; RV32IZFBFMIN-NEXT: fmv.s fa0, fs0
; RV32IZFBFMIN-NEXT: call __fixsfdi
+; RV32IZFBFMIN-NEXT: and a0, s3, a0
+; RV32IZFBFMIN-NEXT: or a0, s1, a0
+; RV32IZFBFMIN-NEXT: feq.s a2, fs0, fs0
+; RV32IZFBFMIN-NEXT: neg a2, a2
; RV32IZFBFMIN-NEXT: lui a4, 524288
-; RV32IZFBFMIN-NEXT: lui a2, 524288
-; RV32IZFBFMIN-NEXT: beqz s0, .LBB10_2
+; RV32IZFBFMIN-NEXT: lui a3, 524288
+; RV32IZFBFMIN-NEXT: beqz s2, .LBB10_2
; RV32IZFBFMIN-NEXT: # %bb.1: # %start
-; RV32IZFBFMIN-NEXT: mv a2, a1
+; RV32IZFBFMIN-NEXT: mv a3, a1
; RV32IZFBFMIN-NEXT: .LBB10_2: # %start
-; RV32IZFBFMIN-NEXT: lui a1, %hi(.LCPI10_0)
-; RV32IZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IZFBFMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFBFMIN-NEXT: beqz a3, .LBB10_4
+; RV32IZFBFMIN-NEXT: and a0, a2, a0
+; RV32IZFBFMIN-NEXT: beqz s0, .LBB10_4
; RV32IZFBFMIN-NEXT: # %bb.3:
-; RV32IZFBFMIN-NEXT: addi a2, a4, -1
+; RV32IZFBFMIN-NEXT: addi a3, a4, -1
; RV32IZFBFMIN-NEXT: .LBB10_4: # %start
-; RV32IZFBFMIN-NEXT: feq.s a1, fs0, fs0
-; RV32IZFBFMIN-NEXT: neg a4, a1
-; RV32IZFBFMIN-NEXT: and a1, a4, a2
-; RV32IZFBFMIN-NEXT: neg a2, a3
-; RV32IZFBFMIN-NEXT: neg a3, s0
-; RV32IZFBFMIN-NEXT: and a0, a3, a0
-; RV32IZFBFMIN-NEXT: or a0, a2, a0
-; RV32IZFBFMIN-NEXT: and a0, a4, a0
-; RV32IZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: addi sp, sp, 16
+; RV32IZFBFMIN-NEXT: and a1, a2, a3
+; RV32IZFBFMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT: flw fs0, 8(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT: addi sp, sp, 32
; RV32IZFBFMIN-NEXT: ret
;
; R32IDZFBFMIN-LABEL: fcvt_l_bf16_sat:
; R32IDZFBFMIN: # %bb.0: # %start
-; R32IDZFBFMIN-NEXT: addi sp, sp, -16
-; R32IDZFBFMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; R32IDZFBFMIN-NEXT: addi sp, sp, -32
+; R32IDZFBFMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; R32IDZFBFMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; R32IDZFBFMIN-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; R32IDZFBFMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; R32IDZFBFMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; R32IDZFBFMIN-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
+; R32IDZFBFMIN-NEXT: lui a0, %hi(.LCPI10_0)
+; R32IDZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; R32IDZFBFMIN-NEXT: fcvt.s.bf16 fs0, fa0
+; R32IDZFBFMIN-NEXT: flt.s s0, fa5, fs0
+; R32IDZFBFMIN-NEXT: neg s1, s0
; R32IDZFBFMIN-NEXT: lui a0, 913408
; R32IDZFBFMIN-NEXT: fmv.w.x fa5, a0
-; R32IDZFBFMIN-NEXT: fle.s s0, fa5, fs0
+; R32IDZFBFMIN-NEXT: fle.s s2, fa5, fs0
+; R32IDZFBFMIN-NEXT: neg s3, s2
; R32IDZFBFMIN-NEXT: fmv.s fa0, fs0
; R32IDZFBFMIN-NEXT: call __fixsfdi
+; R32IDZFBFMIN-NEXT: and a0, s3, a0
+; R32IDZFBFMIN-NEXT: or a0, s1, a0
+; R32IDZFBFMIN-NEXT: feq.s a2, fs0, fs0
+; R32IDZFBFMIN-NEXT: neg a2, a2
; R32IDZFBFMIN-NEXT: lui a4, 524288
-; R32IDZFBFMIN-NEXT: lui a2, 524288
-; R32IDZFBFMIN-NEXT: beqz s0, .LBB10_2
+; R32IDZFBFMIN-NEXT: lui a3, 524288
+; R32IDZFBFMIN-NEXT: beqz s2, .LBB10_2
; R32IDZFBFMIN-NEXT: # %bb.1: # %start
-; R32IDZFBFMIN-NEXT: mv a2, a1
+; R32IDZFBFMIN-NEXT: mv a3, a1
; R32IDZFBFMIN-NEXT: .LBB10_2: # %start
-; R32IDZFBFMIN-NEXT: lui a1, %hi(.LCPI10_0)
-; R32IDZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
-; R32IDZFBFMIN-NEXT: flt.s a3, fa5, fs0
-; R32IDZFBFMIN-NEXT: beqz a3, .LBB10_4
+; R32IDZFBFMIN-NEXT: and a0, a2, a0
+; R32IDZFBFMIN-NEXT: beqz s0, .LBB10_4
; R32IDZFBFMIN-NEXT: # %bb.3:
-; R32IDZFBFMIN-NEXT: addi a2, a4, -1
+; R32IDZFBFMIN-NEXT: addi a3, a4, -1
; R32IDZFBFMIN-NEXT: .LBB10_4: # %start
-; R32IDZFBFMIN-NEXT: feq.s a1, fs0, fs0
-; R32IDZFBFMIN-NEXT: neg a4, a1
-; R32IDZFBFMIN-NEXT: and a1, a4, a2
-; R32IDZFBFMIN-NEXT: neg a2, a3
-; R32IDZFBFMIN-NEXT: neg a3, s0
-; R32IDZFBFMIN-NEXT: and a0, a3, a0
-; R32IDZFBFMIN-NEXT: or a0, a2, a0
-; R32IDZFBFMIN-NEXT: and a0, a4, a0
-; R32IDZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; R32IDZFBFMIN-NEXT: and a1, a2, a3
+; R32IDZFBFMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; R32IDZFBFMIN-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; R32IDZFBFMIN-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; R32IDZFBFMIN-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; R32IDZFBFMIN-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; R32IDZFBFMIN-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
-; R32IDZFBFMIN-NEXT: addi sp, sp, 16
+; R32IDZFBFMIN-NEXT: addi sp, sp, 32
; R32IDZFBFMIN-NEXT: ret
;
; RV32ID-LABEL: fcvt_l_bf16_sat:
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index 6024a29da33d2e3..c147d6ec6d9b15b 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -692,27 +692,28 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a4, 524288
+; RV32IFD-NEXT: lui a3, 524288
+; RV32IFD-NEXT: li a4, 1
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: beqz s0, .LBB12_2
+; RV32IFD-NEXT: bne s0, a4, .LBB12_2
; RV32IFD-NEXT: # %bb.1: # %start
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB12_2: # %start
; RV32IFD-NEXT: lui a1, %hi(.LCPI12_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI12_1)(a1)
-; RV32IFD-NEXT: flt.d a3, fa5, fs0
-; RV32IFD-NEXT: beqz a3, .LBB12_4
+; RV32IFD-NEXT: flt.d a4, fa5, fs0
+; RV32IFD-NEXT: beqz a4, .LBB12_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a4, -1
+; RV32IFD-NEXT: addi a2, a3, -1
; RV32IFD-NEXT: .LBB12_4: # %start
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a4, a1
-; RV32IFD-NEXT: and a1, a4, a2
-; RV32IFD-NEXT: neg a2, a3
-; RV32IFD-NEXT: neg a3, s0
-; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a3, a1
+; RV32IFD-NEXT: and a1, a3, a2
+; RV32IFD-NEXT: neg a2, a4
+; RV32IFD-NEXT: neg a4, s0
; RV32IFD-NEXT: and a0, a4, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a3, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -789,32 +790,33 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a1
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: lui a3, 278016
-; RV32I-NEXT: addi a3, a3, -1
-; RV32I-NEXT: li a2, -1
-; RV32I-NEXT: call __gtdf2
-; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: lui a3, 802304
-; RV32I-NEXT: mv a0, s1
-; RV32I-NEXT: mv a1, s0
; RV32I-NEXT: li a2, 0
; RV32I-NEXT: call __gedf2
-; RV32I-NEXT: mv s3, a0
+; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s0
; RV32I-NEXT: call __fixdfdi
-; RV32I-NEXT: mv s4, a0
-; RV32I-NEXT: mv s5, a1
-; RV32I-NEXT: lui a0, 524288
-; RV32I-NEXT: bgez s3, .LBB12_2
+; RV32I-NEXT: mv s3, a0
+; RV32I-NEXT: mv s4, a1
+; RV32I-NEXT: lui s6, 524288
+; RV32I-NEXT: bgez s2, .LBB12_2
; RV32I-NEXT: # %bb.1: # %start
-; RV32I-NEXT: lui s5, 524288
+; RV32I-NEXT: lui s4, 524288
; RV32I-NEXT: .LBB12_2: # %start
-; RV32I-NEXT: blez s2, .LBB12_4
+; RV32I-NEXT: lui a3, 278016
+; RV32I-NEXT: addi a3, a3, -1
+; RV32I-NEXT: li a2, -1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s0
+; RV32I-NEXT: call __gtdf2
+; RV32I-NEXT: mv s5, a0
+; RV32I-NEXT: blez a0, .LBB12_4
; RV32I-NEXT: # %bb.3: # %start
-; RV32I-NEXT: addi s5, a0, -1
+; RV32I-NEXT: addi s4, s6, -1
; RV32I-NEXT: .LBB12_4: # %start
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s0
@@ -823,11 +825,11 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32I-NEXT: call __unorddf2
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a1, a0, s5
-; RV32I-NEXT: slti a2, s3, 0
+; RV32I-NEXT: and a1, a0, s4
+; RV32I-NEXT: slti a2, s2, 0
; RV32I-NEXT: addi a2, a2, -1
-; RV32I-NEXT: and a2, a2, s4
-; RV32I-NEXT: sgtz a3, s2
+; RV32I-NEXT: and a2, a2, s3
+; RV32I-NEXT: sgtz a3, s5
; RV32I-NEXT: neg a3, a3
; RV32I-NEXT: or a2, a3, a2
; RV32I-NEXT: and a0, a0, a2
@@ -838,6 +840,7 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s6, 0(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -946,23 +949,22 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
; RV32IFD-NEXT: addi sp, sp, -16
; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IFD-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fmv.d fs0, fa0
+; RV32IFD-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: lui a0, %hi(.LCPI14_0)
+; RV32IFD-NEXT: fld fa5, %lo(.LCPI14_0)(a0)
+; RV32IFD-NEXT: flt.d a0, fa5, fa0
+; RV32IFD-NEXT: neg s0, a0
; RV32IFD-NEXT: fcvt.d.w fa5, zero
; RV32IFD-NEXT: fle.d a0, fa5, fa0
-; RV32IFD-NEXT: neg s0, a0
+; RV32IFD-NEXT: neg s1, a0
; RV32IFD-NEXT: call __fixunsdfdi
-; RV32IFD-NEXT: lui a2, %hi(.LCPI14_0)
-; RV32IFD-NEXT: fld fa5, %lo(.LCPI14_0)(a2)
-; RV32IFD-NEXT: and a0, s0, a0
-; RV32IFD-NEXT: flt.d a2, fa5, fs0
-; RV32IFD-NEXT: neg a2, a2
-; RV32IFD-NEXT: or a0, a2, a0
-; RV32IFD-NEXT: and a1, s0, a1
-; RV32IFD-NEXT: or a1, a2, a1
+; RV32IFD-NEXT: and a0, s1, a0
+; RV32IFD-NEXT: or a0, s0, a0
+; RV32IFD-NEXT: and a1, s1, a1
+; RV32IFD-NEXT: or a1, s0, a1
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: addi sp, sp, 16
; RV32IFD-NEXT: ret
;
@@ -981,24 +983,27 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFINXZDINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32IZFINXZDINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32IZFINXZDINX-NEXT: mv s1, a1
+; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero
; RV32IZFINXZDINX-NEXT: mv s0, a0
+; RV32IZFINXZDINX-NEXT: fle.d a0, a2, s0
+; RV32IZFINXZDINX-NEXT: neg s2, a0
+; RV32IZFINXZDINX-NEXT: mv a0, s0
; RV32IZFINXZDINX-NEXT: call __fixunsdfdi
-; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero
-; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI14_0)
-; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI14_0+4)(a4)
-; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI14_0)(a4)
-; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI14_0)
+; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI14_0+4)(a2)
+; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI14_0)(a2)
+; RV32IZFINXZDINX-NEXT: and a0, s2, a0
+; RV32IZFINXZDINX-NEXT: flt.d a2, a2, s0
; RV32IZFINXZDINX-NEXT: neg a2, a2
-; RV32IZFINXZDINX-NEXT: and a0, a2, a0
-; RV32IZFINXZDINX-NEXT: flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT: neg a3, a3
-; RV32IZFINXZDINX-NEXT: or a0, a3, a0
-; RV32IZFINXZDINX-NEXT: and a1, a2, a1
-; RV32IZFINXZDINX-NEXT: or a1, a3, a1
+; RV32IZFINXZDINX-NEXT: or a0, a2, a0
+; RV32IZFINXZDINX-NEXT: and a1, s2, a1
+; RV32IZFINXZDINX-NEXT: or a1, a2, a1
; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: addi sp, sp, 16
; RV32IZFINXZDINX-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
index 927eee2e9e54512..f1c56b320b76c46 100644
--- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
@@ -53,27 +53,28 @@ define i64 @test_floor_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a4, 524288
+; RV32IFD-NEXT: lui a3, 524288
+; RV32IFD-NEXT: li a4, 1
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: beqz s0, .LBB1_2
+; RV32IFD-NEXT: bne s0, a4, .LBB1_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB1_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI1_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI1_1)(a1)
-; RV32IFD-NEXT: flt.d a3, fa5, fs0
-; RV32IFD-NEXT: beqz a3, .LBB1_4
+; RV32IFD-NEXT: flt.d a4, fa5, fs0
+; RV32IFD-NEXT: beqz a4, .LBB1_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a4, -1
+; RV32IFD-NEXT: addi a2, a3, -1
; RV32IFD-NEXT: .LBB1_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a4, a1
-; RV32IFD-NEXT: and a1, a4, a2
-; RV32IFD-NEXT: neg a2, a3
-; RV32IFD-NEXT: neg a3, s0
-; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a3, a1
+; RV32IFD-NEXT: and a1, a3, a2
+; RV32IFD-NEXT: neg a2, a4
+; RV32IFD-NEXT: neg a4, s0
; RV32IFD-NEXT: and a0, a4, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a3, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -298,27 +299,28 @@ define i64 @test_ceil_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a4, 524288
+; RV32IFD-NEXT: lui a3, 524288
+; RV32IFD-NEXT: li a4, 1
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: beqz s0, .LBB5_2
+; RV32IFD-NEXT: bne s0, a4, .LBB5_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB5_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI5_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI5_1)(a1)
-; RV32IFD-NEXT: flt.d a3, fa5, fs0
-; RV32IFD-NEXT: beqz a3, .LBB5_4
+; RV32IFD-NEXT: flt.d a4, fa5, fs0
+; RV32IFD-NEXT: beqz a4, .LBB5_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a4, -1
+; RV32IFD-NEXT: addi a2, a3, -1
; RV32IFD-NEXT: .LBB5_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a4, a1
-; RV32IFD-NEXT: and a1, a4, a2
-; RV32IFD-NEXT: neg a2, a3
-; RV32IFD-NEXT: neg a3, s0
-; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a3, a1
+; RV32IFD-NEXT: and a1, a3, a2
+; RV32IFD-NEXT: neg a2, a4
+; RV32IFD-NEXT: neg a4, s0
; RV32IFD-NEXT: and a0, a4, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a3, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -543,27 +545,28 @@ define i64 @test_trunc_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a4, 524288
+; RV32IFD-NEXT: lui a3, 524288
+; RV32IFD-NEXT: li a4, 1
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: beqz s0, .LBB9_2
+; RV32IFD-NEXT: bne s0, a4, .LBB9_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB9_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI9_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI9_1)(a1)
-; RV32IFD-NEXT: flt.d a3, fa5, fs0
-; RV32IFD-NEXT: beqz a3, .LBB9_4
+; RV32IFD-NEXT: flt.d a4, fa5, fs0
+; RV32IFD-NEXT: beqz a4, .LBB9_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a4, -1
+; RV32IFD-NEXT: addi a2, a3, -1
; RV32IFD-NEXT: .LBB9_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a4, a1
-; RV32IFD-NEXT: and a1, a4, a2
-; RV32IFD-NEXT: neg a2, a3
-; RV32IFD-NEXT: neg a3, s0
-; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a3, a1
+; RV32IFD-NEXT: and a1, a3, a2
+; RV32IFD-NEXT: neg a2, a4
+; RV32IFD-NEXT: neg a4, s0
; RV32IFD-NEXT: and a0, a4, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a3, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -788,27 +791,28 @@ define i64 @test_round_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a4, 524288
+; RV32IFD-NEXT: lui a3, 524288
+; RV32IFD-NEXT: li a4, 1
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: beqz s0, .LBB13_2
+; RV32IFD-NEXT: bne s0, a4, .LBB13_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB13_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI13_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI13_1)(a1)
-; RV32IFD-NEXT: flt.d a3, fa5, fs0
-; RV32IFD-NEXT: beqz a3, .LBB13_4
+; RV32IFD-NEXT: flt.d a4, fa5, fs0
+; RV32IFD-NEXT: beqz a4, .LBB13_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a4, -1
+; RV32IFD-NEXT: addi a2, a3, -1
; RV32IFD-NEXT: .LBB13_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a4, a1
-; RV32IFD-NEXT: and a1, a4, a2
-; RV32IFD-NEXT: neg a2, a3
-; RV32IFD-NEXT: neg a3, s0
-; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a3, a1
+; RV32IFD-NEXT: and a1, a3, a2
+; RV32IFD-NEXT: neg a2, a4
+; RV32IFD-NEXT: neg a4, s0
; RV32IFD-NEXT: and a0, a4, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a3, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -1033,27 +1037,28 @@ define i64 @test_roundeven_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a4, 524288
+; RV32IFD-NEXT: lui a3, 524288
+; RV32IFD-NEXT: li a4, 1
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: beqz s0, .LBB17_2
+; RV32IFD-NEXT: bne s0, a4, .LBB17_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB17_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI17_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI17_1)(a1)
-; RV32IFD-NEXT: flt.d a3, fa5, fs0
-; RV32IFD-NEXT: beqz a3, .LBB17_4
+; RV32IFD-NEXT: flt.d a4, fa5, fs0
+; RV32IFD-NEXT: beqz a4, .LBB17_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a4, -1
+; RV32IFD-NEXT: addi a2, a3, -1
; RV32IFD-NEXT: .LBB17_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a4, a1
-; RV32IFD-NEXT: and a1, a4, a2
-; RV32IFD-NEXT: neg a2, a3
-; RV32IFD-NEXT: neg a3, s0
-; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a3, a1
+; RV32IFD-NEXT: and a1, a3, a2
+; RV32IFD-NEXT: neg a2, a4
+; RV32IFD-NEXT: neg a4, s0
; RV32IFD-NEXT: and a0, a4, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a3, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -1278,27 +1283,28 @@ define i64 @test_rint_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a4, 524288
+; RV32IFD-NEXT: lui a3, 524288
+; RV32IFD-NEXT: li a4, 1
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: beqz s0, .LBB21_2
+; RV32IFD-NEXT: bne s0, a4, .LBB21_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB21_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI21_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI21_1)(a1)
-; RV32IFD-NEXT: flt.d a3, fa5, fs0
-; RV32IFD-NEXT: beqz a3, .LBB21_4
+; RV32IFD-NEXT: flt.d a4, fa5, fs0
+; RV32IFD-NEXT: beqz a4, .LBB21_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a4, -1
+; RV32IFD-NEXT: addi a2, a3, -1
; RV32IFD-NEXT: .LBB21_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a4, a1
-; RV32IFD-NEXT: and a1, a4, a2
-; RV32IFD-NEXT: neg a2, a3
-; RV32IFD-NEXT: neg a3, s0
-; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a3, a1
+; RV32IFD-NEXT: and a1, a3, a2
+; RV32IFD-NEXT: neg a2, a4
+; RV32IFD-NEXT: neg a4, s0
; RV32IFD-NEXT: and a0, a4, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a3, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll
index 7eabd3f5f2273af..653b64ec730496d 100644
--- a/llvm/test/CodeGen/RISCV/float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert.ll
@@ -275,24 +275,26 @@ define i32 @fcvt_wu_s_sat(float %a) nounwind {
; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: lui a1, 325632
+; RV32I-NEXT: addi a1, a1, -1
+; RV32I-NEXT: call __gtsf2
+; RV32I-NEXT: sgtz a0, a0
+; RV32I-NEXT: neg s1, a0
+; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __gesf2
; RV32I-NEXT: slti a0, a0, 0
-; RV32I-NEXT: addi s1, a0, -1
+; RV32I-NEXT: addi s2, a0, -1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: call __fixunssfsi
-; RV32I-NEXT: and s1, s1, a0
-; RV32I-NEXT: lui a1, 325632
-; RV32I-NEXT: addi a1, a1, -1
-; RV32I-NEXT: mv a0, s0
-; RV32I-NEXT: call __gtsf2
-; RV32I-NEXT: sgtz a0, a0
-; RV32I-NEXT: neg a0, a0
-; RV32I-NEXT: or a0, a0, s1
+; RV32I-NEXT: and a0, s2, a0
+; RV32I-NEXT: or a0, s1, a0
; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
@@ -616,36 +618,38 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fa0
+; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: call __fixsfdi
+; RV32IF-NEXT: lui a2, %hi(.LCPI12_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI12_0)(a2)
+; RV32IF-NEXT: and a0, s1, a0
+; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: feq.s a2, fs0, fs0
+; RV32IF-NEXT: neg a2, a2
+; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
-; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB12_2
; RV32IF-NEXT: # %bb.1: # %start
-; RV32IF-NEXT: mv a2, a1
+; RV32IF-NEXT: mv a4, a1
; RV32IF-NEXT: .LBB12_2: # %start
-; RV32IF-NEXT: lui a1, %hi(.LCPI12_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI12_0)(a1)
-; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: and a0, a2, a0
; RV32IF-NEXT: beqz a3, .LBB12_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: addi a2, a4, -1
+; RV32IF-NEXT: addi a4, a5, -1
; RV32IF-NEXT: .LBB12_4: # %start
-; RV32IF-NEXT: feq.s a1, fs0, fs0
-; RV32IF-NEXT: neg a4, a1
-; RV32IF-NEXT: and a1, a4, a2
-; RV32IF-NEXT: neg a2, s0
-; RV32IF-NEXT: and a0, a2, a0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: and a0, a4, a0
+; RV32IF-NEXT: and a1, a2, a4
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -863,23 +867,22 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fmv.s fs0, fa0
+; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: lui a0, %hi(.LCPI14_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI14_0)(a0)
+; RV32IF-NEXT: flt.s a0, fa5, fa0
+; RV32IF-NEXT: neg s0, a0
; RV32IF-NEXT: fmv.w.x fa5, zero
; RV32IF-NEXT: fle.s a0, fa5, fa0
-; RV32IF-NEXT: neg s0, a0
+; RV32IF-NEXT: neg s1, a0
; RV32IF-NEXT: call __fixunssfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI14_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI14_0)(a2)
-; RV32IF-NEXT: and a0, s0, a0
-; RV32IF-NEXT: flt.s a2, fa5, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: and a1, s0, a1
-; RV32IF-NEXT: or a1, a2, a1
+; RV32IF-NEXT: and a0, s1, a0
+; RV32IF-NEXT: or a0, s0, a0
+; RV32IF-NEXT: and a1, s1, a1
+; RV32IF-NEXT: or a1, s0, a1
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -898,19 +901,17 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
; RV32IZFINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32IZFINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFINX-NEXT: mv s0, a0
-; RV32IZFINX-NEXT: fle.s a0, zero, a0
-; RV32IZFINX-NEXT: neg s1, a0
-; RV32IZFINX-NEXT: mv a0, s0
+; RV32IZFINX-NEXT: lui a1, %hi(.LCPI14_0)
+; RV32IZFINX-NEXT: lw a1, %lo(.LCPI14_0)(a1)
+; RV32IZFINX-NEXT: flt.s a1, a1, a0
+; RV32IZFINX-NEXT: neg s0, a1
+; RV32IZFINX-NEXT: fle.s a1, zero, a0
+; RV32IZFINX-NEXT: neg s1, a1
; RV32IZFINX-NEXT: call __fixunssfdi
-; RV32IZFINX-NEXT: lui a2, %hi(.LCPI14_0)
-; RV32IZFINX-NEXT: lw a2, %lo(.LCPI14_0)(a2)
; RV32IZFINX-NEXT: and a0, s1, a0
-; RV32IZFINX-NEXT: flt.s a2, a2, s0
-; RV32IZFINX-NEXT: neg a2, a2
-; RV32IZFINX-NEXT: or a0, a2, a0
+; RV32IZFINX-NEXT: or a0, s0, a0
; RV32IZFINX-NEXT: and a1, s1, a1
-; RV32IZFINX-NEXT: or a1, a2, a1
+; RV32IZFINX-NEXT: or a1, s0, a1
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -928,36 +929,33 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
;
; RV32I-LABEL: fcvt_lu_s_sat:
; RV32I: # %bb.0: # %start
-; RV32I-NEXT: addi sp, sp, -32
-; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: lui a1, 391168
+; RV32I-NEXT: addi a1, a1, -1
+; RV32I-NEXT: call __gtsf2
+; RV32I-NEXT: sgtz a0, a0
+; RV32I-NEXT: neg s1, a0
+; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __gesf2
; RV32I-NEXT: slti a0, a0, 0
; RV32I-NEXT: addi s2, a0, -1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: call __fixunssfdi
-; RV32I-NEXT: mv s1, a1
-; RV32I-NEXT: and s3, s2, a0
-; RV32I-NEXT: lui a1, 391168
-; RV32I-NEXT: addi a1, a1, -1
-; RV32I-NEXT: mv a0, s0
-; RV32I-NEXT: call __gtsf2
-; RV32I-NEXT: sgtz a0, a0
-; RV32I-NEXT: neg a1, a0
-; RV32I-NEXT: or a0, a1, s3
-; RV32I-NEXT: and a2, s2, s1
-; RV32I-NEXT: or a1, a1, a2
-; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: and a0, s2, a0
+; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: and a1, s2, a1
+; RV32I-NEXT: or a1, s1, a1
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
; RV64I-LABEL: fcvt_lu_s_sat:
@@ -2091,24 +2089,26 @@ define zeroext i32 @fcvt_wu_s_sat_zext(float %a) nounwind {
; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: lui a1, 325632
+; RV32I-NEXT: addi a1, a1, -1
+; RV32I-NEXT: call __gtsf2
+; RV32I-NEXT: sgtz a0, a0
+; RV32I-NEXT: neg s1, a0
+; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __gesf2
; RV32I-NEXT: slti a0, a0, 0
-; RV32I-NEXT: addi s1, a0, -1
+; RV32I-NEXT: addi s2, a0, -1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: call __fixunssfsi
-; RV32I-NEXT: and s1, s1, a0
-; RV32I-NEXT: lui a1, 325632
-; RV32I-NEXT: addi a1, a1, -1
-; RV32I-NEXT: mv a0, s0
-; RV32I-NEXT: call __gtsf2
-; RV32I-NEXT: sgtz a0, a0
-; RV32I-NEXT: neg a0, a0
-; RV32I-NEXT: or a0, a0, s1
+; RV32I-NEXT: and a0, s2, a0
+; RV32I-NEXT: or a0, s1, a0
; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
index 5e99c7eb9056282..4f747c278da03c1 100644
--- a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
@@ -37,7 +37,8 @@ define i64 @test_floor_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -52,32 +53,33 @@ define i64 @test_floor_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
+; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
+; RV32IF-NEXT: lui a2, %hi(.LCPI1_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI1_0)(a2)
+; RV32IF-NEXT: and a0, s1, a0
+; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: feq.s a2, fs0, fs0
+; RV32IF-NEXT: neg a2, a2
+; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
-; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB1_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a2, a1
+; RV32IF-NEXT: mv a4, a1
; RV32IF-NEXT: .LBB1_4:
-; RV32IF-NEXT: lui a1, %hi(.LCPI1_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI1_0)(a1)
-; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: and a0, a2, a0
; RV32IF-NEXT: beqz a3, .LBB1_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a2, a4, -1
+; RV32IF-NEXT: addi a4, a5, -1
; RV32IF-NEXT: .LBB1_6:
-; RV32IF-NEXT: feq.s a1, fs0, fs0
-; RV32IF-NEXT: neg a4, a1
-; RV32IF-NEXT: and a1, a4, a2
-; RV32IF-NEXT: neg a2, s0
-; RV32IF-NEXT: and a0, a2, a0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: and a0, a4, a0
+; RV32IF-NEXT: and a1, a2, a4
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -115,23 +117,23 @@ define i64 @test_floor_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI1_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI1_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a4, a2, s0
-; RV32IZFINX-NEXT: neg a2, a4
+; RV32IZFINX-NEXT: flt.s a3, a2, s0
+; RV32IZFINX-NEXT: neg a2, a3
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a3, 524288
+; RV32IZFINX-NEXT: lui a4, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB1_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a3, a1
+; RV32IZFINX-NEXT: mv a4, a1
; RV32IZFINX-NEXT: .LBB1_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a4, .LBB1_6
+; RV32IZFINX-NEXT: beqz a3, .LBB1_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a3, a5, -1
+; RV32IZFINX-NEXT: addi a4, a5, -1
; RV32IZFINX-NEXT: .LBB1_6:
-; RV32IZFINX-NEXT: and a1, a2, a3
+; RV32IZFINX-NEXT: and a1, a2, a4
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -297,7 +299,8 @@ define i64 @test_ceil_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -312,32 +315,33 @@ define i64 @test_ceil_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
+; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
+; RV32IF-NEXT: lui a2, %hi(.LCPI5_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI5_0)(a2)
+; RV32IF-NEXT: and a0, s1, a0
+; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: feq.s a2, fs0, fs0
+; RV32IF-NEXT: neg a2, a2
+; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
-; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB5_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a2, a1
+; RV32IF-NEXT: mv a4, a1
; RV32IF-NEXT: .LBB5_4:
-; RV32IF-NEXT: lui a1, %hi(.LCPI5_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI5_0)(a1)
-; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: and a0, a2, a0
; RV32IF-NEXT: beqz a3, .LBB5_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a2, a4, -1
+; RV32IF-NEXT: addi a4, a5, -1
; RV32IF-NEXT: .LBB5_6:
-; RV32IF-NEXT: feq.s a1, fs0, fs0
-; RV32IF-NEXT: neg a4, a1
-; RV32IF-NEXT: and a1, a4, a2
-; RV32IF-NEXT: neg a2, s0
-; RV32IF-NEXT: and a0, a2, a0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: and a0, a4, a0
+; RV32IF-NEXT: and a1, a2, a4
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -375,23 +379,23 @@ define i64 @test_ceil_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI5_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI5_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a4, a2, s0
-; RV32IZFINX-NEXT: neg a2, a4
+; RV32IZFINX-NEXT: flt.s a3, a2, s0
+; RV32IZFINX-NEXT: neg a2, a3
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a3, 524288
+; RV32IZFINX-NEXT: lui a4, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB5_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a3, a1
+; RV32IZFINX-NEXT: mv a4, a1
; RV32IZFINX-NEXT: .LBB5_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a4, .LBB5_6
+; RV32IZFINX-NEXT: beqz a3, .LBB5_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a3, a5, -1
+; RV32IZFINX-NEXT: addi a4, a5, -1
; RV32IZFINX-NEXT: .LBB5_6:
-; RV32IZFINX-NEXT: and a1, a2, a3
+; RV32IZFINX-NEXT: and a1, a2, a4
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -557,7 +561,8 @@ define i64 @test_trunc_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -572,32 +577,33 @@ define i64 @test_trunc_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
+; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
+; RV32IF-NEXT: lui a2, %hi(.LCPI9_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI9_0)(a2)
+; RV32IF-NEXT: and a0, s1, a0
+; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: feq.s a2, fs0, fs0
+; RV32IF-NEXT: neg a2, a2
+; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
-; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB9_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a2, a1
+; RV32IF-NEXT: mv a4, a1
; RV32IF-NEXT: .LBB9_4:
-; RV32IF-NEXT: lui a1, %hi(.LCPI9_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI9_0)(a1)
-; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: and a0, a2, a0
; RV32IF-NEXT: beqz a3, .LBB9_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a2, a4, -1
+; RV32IF-NEXT: addi a4, a5, -1
; RV32IF-NEXT: .LBB9_6:
-; RV32IF-NEXT: feq.s a1, fs0, fs0
-; RV32IF-NEXT: neg a4, a1
-; RV32IF-NEXT: and a1, a4, a2
-; RV32IF-NEXT: neg a2, s0
-; RV32IF-NEXT: and a0, a2, a0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: and a0, a4, a0
+; RV32IF-NEXT: and a1, a2, a4
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -635,23 +641,23 @@ define i64 @test_trunc_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI9_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI9_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a4, a2, s0
-; RV32IZFINX-NEXT: neg a2, a4
+; RV32IZFINX-NEXT: flt.s a3, a2, s0
+; RV32IZFINX-NEXT: neg a2, a3
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a3, 524288
+; RV32IZFINX-NEXT: lui a4, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB9_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a3, a1
+; RV32IZFINX-NEXT: mv a4, a1
; RV32IZFINX-NEXT: .LBB9_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a4, .LBB9_6
+; RV32IZFINX-NEXT: beqz a3, .LBB9_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a3, a5, -1
+; RV32IZFINX-NEXT: addi a4, a5, -1
; RV32IZFINX-NEXT: .LBB9_6:
-; RV32IZFINX-NEXT: and a1, a2, a3
+; RV32IZFINX-NEXT: and a1, a2, a4
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -817,7 +823,8 @@ define i64 @test_round_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -832,32 +839,33 @@ define i64 @test_round_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
+; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
+; RV32IF-NEXT: lui a2, %hi(.LCPI13_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI13_0)(a2)
+; RV32IF-NEXT: and a0, s1, a0
+; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: feq.s a2, fs0, fs0
+; RV32IF-NEXT: neg a2, a2
+; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
-; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB13_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a2, a1
+; RV32IF-NEXT: mv a4, a1
; RV32IF-NEXT: .LBB13_4:
-; RV32IF-NEXT: lui a1, %hi(.LCPI13_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI13_0)(a1)
-; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: and a0, a2, a0
; RV32IF-NEXT: beqz a3, .LBB13_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a2, a4, -1
+; RV32IF-NEXT: addi a4, a5, -1
; RV32IF-NEXT: .LBB13_6:
-; RV32IF-NEXT: feq.s a1, fs0, fs0
-; RV32IF-NEXT: neg a4, a1
-; RV32IF-NEXT: and a1, a4, a2
-; RV32IF-NEXT: neg a2, s0
-; RV32IF-NEXT: and a0, a2, a0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: and a0, a4, a0
+; RV32IF-NEXT: and a1, a2, a4
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -895,23 +903,23 @@ define i64 @test_round_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI13_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI13_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a4, a2, s0
-; RV32IZFINX-NEXT: neg a2, a4
+; RV32IZFINX-NEXT: flt.s a3, a2, s0
+; RV32IZFINX-NEXT: neg a2, a3
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a3, 524288
+; RV32IZFINX-NEXT: lui a4, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB13_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a3, a1
+; RV32IZFINX-NEXT: mv a4, a1
; RV32IZFINX-NEXT: .LBB13_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a4, .LBB13_6
+; RV32IZFINX-NEXT: beqz a3, .LBB13_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a3, a5, -1
+; RV32IZFINX-NEXT: addi a4, a5, -1
; RV32IZFINX-NEXT: .LBB13_6:
-; RV32IZFINX-NEXT: and a1, a2, a3
+; RV32IZFINX-NEXT: and a1, a2, a4
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -1077,7 +1085,8 @@ define i64 @test_roundeven_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -1092,32 +1101,33 @@ define i64 @test_roundeven_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
+; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
+; RV32IF-NEXT: lui a2, %hi(.LCPI17_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI17_0)(a2)
+; RV32IF-NEXT: and a0, s1, a0
+; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: feq.s a2, fs0, fs0
+; RV32IF-NEXT: neg a2, a2
+; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
-; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB17_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a2, a1
+; RV32IF-NEXT: mv a4, a1
; RV32IF-NEXT: .LBB17_4:
-; RV32IF-NEXT: lui a1, %hi(.LCPI17_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI17_0)(a1)
-; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: and a0, a2, a0
; RV32IF-NEXT: beqz a3, .LBB17_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a2, a4, -1
+; RV32IF-NEXT: addi a4, a5, -1
; RV32IF-NEXT: .LBB17_6:
-; RV32IF-NEXT: feq.s a1, fs0, fs0
-; RV32IF-NEXT: neg a4, a1
-; RV32IF-NEXT: and a1, a4, a2
-; RV32IF-NEXT: neg a2, s0
-; RV32IF-NEXT: and a0, a2, a0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: and a0, a4, a0
+; RV32IF-NEXT: and a1, a2, a4
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -1155,23 +1165,23 @@ define i64 @test_roundeven_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI17_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI17_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a4, a2, s0
-; RV32IZFINX-NEXT: neg a2, a4
+; RV32IZFINX-NEXT: flt.s a3, a2, s0
+; RV32IZFINX-NEXT: neg a2, a3
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a3, 524288
+; RV32IZFINX-NEXT: lui a4, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB17_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a3, a1
+; RV32IZFINX-NEXT: mv a4, a1
; RV32IZFINX-NEXT: .LBB17_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a4, .LBB17_6
+; RV32IZFINX-NEXT: beqz a3, .LBB17_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a3, a5, -1
+; RV32IZFINX-NEXT: addi a4, a5, -1
; RV32IZFINX-NEXT: .LBB17_6:
-; RV32IZFINX-NEXT: and a1, a2, a3
+; RV32IZFINX-NEXT: and a1, a2, a4
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -1337,7 +1347,8 @@ define i64 @test_rint_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -1352,32 +1363,33 @@ define i64 @test_rint_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
+; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
+; RV32IF-NEXT: lui a2, %hi(.LCPI21_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI21_0)(a2)
+; RV32IF-NEXT: and a0, s1, a0
+; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: feq.s a2, fs0, fs0
+; RV32IF-NEXT: neg a2, a2
+; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
-; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB21_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a2, a1
+; RV32IF-NEXT: mv a4, a1
; RV32IF-NEXT: .LBB21_4:
-; RV32IF-NEXT: lui a1, %hi(.LCPI21_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI21_0)(a1)
-; RV32IF-NEXT: flt.s a3, fa5, fs0
+; RV32IF-NEXT: and a0, a2, a0
; RV32IF-NEXT: beqz a3, .LBB21_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a2, a4, -1
+; RV32IF-NEXT: addi a4, a5, -1
; RV32IF-NEXT: .LBB21_6:
-; RV32IF-NEXT: feq.s a1, fs0, fs0
-; RV32IF-NEXT: neg a4, a1
-; RV32IF-NEXT: and a1, a4, a2
-; RV32IF-NEXT: neg a2, s0
-; RV32IF-NEXT: and a0, a2, a0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: and a0, a4, a0
+; RV32IF-NEXT: and a1, a2, a4
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -1415,23 +1427,23 @@ define i64 @test_rint_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI21_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI21_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a4, a2, s0
-; RV32IZFINX-NEXT: neg a2, a4
+; RV32IZFINX-NEXT: flt.s a3, a2, s0
+; RV32IZFINX-NEXT: neg a2, a3
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a3, 524288
+; RV32IZFINX-NEXT: lui a4, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB21_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a3, a1
+; RV32IZFINX-NEXT: mv a4, a1
; RV32IZFINX-NEXT: .LBB21_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a4, .LBB21_6
+; RV32IZFINX-NEXT: beqz a3, .LBB21_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a3, a5, -1
+; RV32IZFINX-NEXT: addi a4, a5, -1
; RV32IZFINX-NEXT: .LBB21_6:
-; RV32IZFINX-NEXT: and a1, a2, a3
+; RV32IZFINX-NEXT: and a1, a2, a4
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll
index f6a53a9d76dd356..c303690aadfff8c 100644
--- a/llvm/test/CodeGen/RISCV/forced-atomics.ll
+++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll
@@ -3567,8 +3567,8 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind {
; RV32-NEXT: # in Loop: Header=BB51_2 Depth=1
; RV32-NEXT: neg a3, a0
; RV32-NEXT: and a3, a3, a1
-; RV32-NEXT: sw a4, 0(sp)
; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a4, 0(sp)
; RV32-NEXT: mv a1, sp
; RV32-NEXT: li a4, 5
; RV32-NEXT: li a5, 5
@@ -3659,8 +3659,8 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind {
; RV32-NEXT: # in Loop: Header=BB52_2 Depth=1
; RV32-NEXT: neg a3, a0
; RV32-NEXT: and a3, a3, a1
-; RV32-NEXT: sw a4, 0(sp)
; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a4, 0(sp)
; RV32-NEXT: mv a1, sp
; RV32-NEXT: li a4, 5
; RV32-NEXT: li a5, 5
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index deb5a6d4013d494..06ab813faf0253b 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -114,8 +114,8 @@ define i32 @utest_f64i32(double %x) {
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: .cfi_offset ra, -4
; RV32IF-NEXT: call __fixunsdfdi
-; RV32IF-NEXT: sltiu a2, a0, -1
; RV32IF-NEXT: seqz a1, a1
+; RV32IF-NEXT: sltiu a2, a0, -1
; RV32IF-NEXT: and a1, a1, a2
; RV32IF-NEXT: addi a1, a1, -1
; RV32IF-NEXT: or a0, a1, a0
@@ -429,8 +429,8 @@ define i32 @utesth_f16i32(half %x) {
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: call __extendhfsf2
; RV32-NEXT: call __fixunssfdi
-; RV32-NEXT: sltiu a2, a0, -1
; RV32-NEXT: seqz a1, a1
+; RV32-NEXT: sltiu a2, a0, -1
; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll
index 31fb6e2ee9c8409..277749c75bbbf19 100644
--- a/llvm/test/CodeGen/RISCV/half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert.ll
@@ -2145,41 +2145,47 @@ define i64 @fcvt_l_h(half %a) nounwind {
define i64 @fcvt_l_h_sat(half %a) nounwind {
; RV32IZFH-LABEL: fcvt_l_h_sat:
; RV32IZFH: # %bb.0: # %start
-; RV32IZFH-NEXT: addi sp, sp, -16
-; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: addi sp, sp, -32
+; RV32IZFH-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 8(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: lui a0, %hi(.LCPI10_0)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
+; RV32IZFH-NEXT: flt.s s0, fa5, fs0
+; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
-; RV32IZFH-NEXT: fle.s s0, fa5, fs0
+; RV32IZFH-NEXT: fle.s s2, fa5, fs0
+; RV32IZFH-NEXT: neg s3, s2
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
+; RV32IZFH-NEXT: and a0, s3, a0
+; RV32IZFH-NEXT: or a0, s1, a0
+; RV32IZFH-NEXT: feq.s a2, fs0, fs0
+; RV32IZFH-NEXT: neg a2, a2
; RV32IZFH-NEXT: lui a4, 524288
-; RV32IZFH-NEXT: lui a2, 524288
-; RV32IZFH-NEXT: beqz s0, .LBB10_2
+; RV32IZFH-NEXT: lui a3, 524288
+; RV32IZFH-NEXT: beqz s2, .LBB10_2
; RV32IZFH-NEXT: # %bb.1: # %start
-; RV32IZFH-NEXT: mv a2, a1
+; RV32IZFH-NEXT: mv a3, a1
; RV32IZFH-NEXT: .LBB10_2: # %start
-; RV32IZFH-NEXT: lui a1, %hi(.LCPI10_0)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: beqz a3, .LBB10_4
+; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: beqz s0, .LBB10_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: addi a3, a4, -1
; RV32IZFH-NEXT: .LBB10_4: # %start
-; RV32IZFH-NEXT: feq.s a1, fs0, fs0
-; RV32IZFH-NEXT: neg a4, a1
-; RV32IZFH-NEXT: and a1, a4, a2
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: neg a3, s0
-; RV32IZFH-NEXT: and a0, a3, a0
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: and a0, a4, a0
-; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: addi sp, sp, 16
+; RV32IZFH-NEXT: and a1, a2, a3
+; RV32IZFH-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 8(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: addi sp, sp, 32
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: fcvt_l_h_sat:
@@ -2193,41 +2199,47 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
;
; RV32IDZFH-LABEL: fcvt_l_h_sat:
; RV32IDZFH: # %bb.0: # %start
-; RV32IDZFH-NEXT: addi sp, sp, -16
-; RV32IDZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32IDZFH-NEXT: addi sp, sp, -32
+; RV32IDZFH-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IDZFH-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IDZFH-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IDZFH-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IDZFH-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32IDZFH-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
+; RV32IDZFH-NEXT: lui a0, %hi(.LCPI10_0)
+; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; RV32IDZFH-NEXT: fcvt.s.h fs0, fa0
+; RV32IDZFH-NEXT: flt.s s0, fa5, fs0
+; RV32IDZFH-NEXT: neg s1, s0
; RV32IDZFH-NEXT: lui a0, 913408
; RV32IDZFH-NEXT: fmv.w.x fa5, a0
-; RV32IDZFH-NEXT: fle.s s0, fa5, fs0
+; RV32IDZFH-NEXT: fle.s s2, fa5, fs0
+; RV32IDZFH-NEXT: neg s3, s2
; RV32IDZFH-NEXT: fmv.s fa0, fs0
; RV32IDZFH-NEXT: call __fixsfdi
+; RV32IDZFH-NEXT: and a0, s3, a0
+; RV32IDZFH-NEXT: or a0, s1, a0
+; RV32IDZFH-NEXT: feq.s a2, fs0, fs0
+; RV32IDZFH-NEXT: neg a2, a2
; RV32IDZFH-NEXT: lui a4, 524288
-; RV32IDZFH-NEXT: lui a2, 524288
-; RV32IDZFH-NEXT: beqz s0, .LBB10_2
+; RV32IDZFH-NEXT: lui a3, 524288
+; RV32IDZFH-NEXT: beqz s2, .LBB10_2
; RV32IDZFH-NEXT: # %bb.1: # %start
-; RV32IDZFH-NEXT: mv a2, a1
+; RV32IDZFH-NEXT: mv a3, a1
; RV32IDZFH-NEXT: .LBB10_2: # %start
-; RV32IDZFH-NEXT: lui a1, %hi(.LCPI10_0)
-; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IDZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IDZFH-NEXT: beqz a3, .LBB10_4
+; RV32IDZFH-NEXT: and a0, a2, a0
+; RV32IDZFH-NEXT: beqz s0, .LBB10_4
; RV32IDZFH-NEXT: # %bb.3:
-; RV32IDZFH-NEXT: addi a2, a4, -1
+; RV32IDZFH-NEXT: addi a3, a4, -1
; RV32IDZFH-NEXT: .LBB10_4: # %start
-; RV32IDZFH-NEXT: feq.s a1, fs0, fs0
-; RV32IDZFH-NEXT: neg a4, a1
-; RV32IDZFH-NEXT: and a1, a4, a2
-; RV32IDZFH-NEXT: neg a2, a3
-; RV32IDZFH-NEXT: neg a3, s0
-; RV32IDZFH-NEXT: and a0, a3, a0
-; RV32IDZFH-NEXT: or a0, a2, a0
-; RV32IDZFH-NEXT: and a0, a4, a0
-; RV32IDZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32IDZFH-NEXT: and a1, a2, a3
+; RV32IDZFH-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IDZFH-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IDZFH-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IDZFH-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IDZFH-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32IDZFH-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
-; RV32IDZFH-NEXT: addi sp, sp, 16
+; RV32IDZFH-NEXT: addi sp, sp, 32
; RV32IDZFH-NEXT: ret
;
; RV64IDZFH-LABEL: fcvt_l_h_sat:
@@ -2503,41 +2515,47 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
;
; RV32ID-LABEL: fcvt_l_h_sat:
; RV32ID: # %bb.0: # %start
-; RV32ID-NEXT: addi sp, sp, -16
-; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32ID-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32ID-NEXT: addi sp, sp, -32
+; RV32ID-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32ID-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32ID-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32ID-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32ID-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32ID-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
; RV32ID-NEXT: call __extendhfsf2
+; RV32ID-NEXT: lui a0, %hi(.LCPI10_0)
+; RV32ID-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; RV32ID-NEXT: fmv.s fs0, fa0
+; RV32ID-NEXT: flt.s s0, fa5, fa0
+; RV32ID-NEXT: neg s1, s0
; RV32ID-NEXT: lui a0, 913408
; RV32ID-NEXT: fmv.w.x fa5, a0
-; RV32ID-NEXT: fle.s s0, fa5, fa0
+; RV32ID-NEXT: fle.s s2, fa5, fa0
+; RV32ID-NEXT: neg s3, s2
; RV32ID-NEXT: call __fixsfdi
+; RV32ID-NEXT: and a0, s3, a0
+; RV32ID-NEXT: or a0, s1, a0
+; RV32ID-NEXT: feq.s a2, fs0, fs0
+; RV32ID-NEXT: neg a2, a2
; RV32ID-NEXT: lui a4, 524288
-; RV32ID-NEXT: lui a2, 524288
-; RV32ID-NEXT: beqz s0, .LBB10_2
+; RV32ID-NEXT: lui a3, 524288
+; RV32ID-NEXT: beqz s2, .LBB10_2
; RV32ID-NEXT: # %bb.1: # %start
-; RV32ID-NEXT: mv a2, a1
+; RV32ID-NEXT: mv a3, a1
; RV32ID-NEXT: .LBB10_2: # %start
-; RV32ID-NEXT: lui a1, %hi(.LCPI10_0)
-; RV32ID-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
-; RV32ID-NEXT: flt.s a3, fa5, fs0
-; RV32ID-NEXT: beqz a3, .LBB10_4
+; RV32ID-NEXT: and a0, a2, a0
+; RV32ID-NEXT: beqz s0, .LBB10_4
; RV32ID-NEXT: # %bb.3:
-; RV32ID-NEXT: addi a2, a4, -1
+; RV32ID-NEXT: addi a3, a4, -1
; RV32ID-NEXT: .LBB10_4: # %start
-; RV32ID-NEXT: feq.s a1, fs0, fs0
-; RV32ID-NEXT: neg a4, a1
-; RV32ID-NEXT: and a1, a4, a2
-; RV32ID-NEXT: neg a2, s0
-; RV32ID-NEXT: and a0, a2, a0
-; RV32ID-NEXT: neg a2, a3
-; RV32ID-NEXT: or a0, a2, a0
-; RV32ID-NEXT: and a0, a4, a0
-; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32ID-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32ID-NEXT: and a1, a2, a3
+; RV32ID-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32ID-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32ID-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32ID-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32ID-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32ID-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
-; RV32ID-NEXT: addi sp, sp, 16
+; RV32ID-NEXT: addi sp, sp, 32
; RV32ID-NEXT: ret
;
; RV64ID-LABEL: fcvt_l_h_sat:
@@ -2556,41 +2574,47 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
;
; RV32IFZFHMIN-LABEL: fcvt_l_h_sat:
; RV32IFZFHMIN: # %bb.0: # %start
-; RV32IFZFHMIN-NEXT: addi sp, sp, -16
-; RV32IFZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT: addi sp, sp, -32
+; RV32IFZFHMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT: fsw fs0, 8(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT: lui a0, %hi(.LCPI10_0)
+; RV32IFZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; RV32IFZFHMIN-NEXT: fcvt.s.h fs0, fa0
+; RV32IFZFHMIN-NEXT: flt.s s0, fa5, fs0
+; RV32IFZFHMIN-NEXT: neg s1, s0
; RV32IFZFHMIN-NEXT: lui a0, 913408
; RV32IFZFHMIN-NEXT: fmv.w.x fa5, a0
-; RV32IFZFHMIN-NEXT: fle.s s0, fa5, fs0
+; RV32IFZFHMIN-NEXT: fle.s s2, fa5, fs0
+; RV32IFZFHMIN-NEXT: neg s3, s2
; RV32IFZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IFZFHMIN-NEXT: call __fixsfdi
+; RV32IFZFHMIN-NEXT: and a0, s3, a0
+; RV32IFZFHMIN-NEXT: or a0, s1, a0
+; RV32IFZFHMIN-NEXT: feq.s a2, fs0, fs0
+; RV32IFZFHMIN-NEXT: neg a2, a2
; RV32IFZFHMIN-NEXT: lui a4, 524288
-; RV32IFZFHMIN-NEXT: lui a2, 524288
-; RV32IFZFHMIN-NEXT: beqz s0, .LBB10_2
+; RV32IFZFHMIN-NEXT: lui a3, 524288
+; RV32IFZFHMIN-NEXT: beqz s2, .LBB10_2
; RV32IFZFHMIN-NEXT: # %bb.1: # %start
-; RV32IFZFHMIN-NEXT: mv a2, a1
+; RV32IFZFHMIN-NEXT: mv a3, a1
; RV32IFZFHMIN-NEXT: .LBB10_2: # %start
-; RV32IFZFHMIN-NEXT: lui a1, %hi(.LCPI10_0)
-; RV32IFZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IFZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IFZFHMIN-NEXT: beqz a3, .LBB10_4
+; RV32IFZFHMIN-NEXT: and a0, a2, a0
+; RV32IFZFHMIN-NEXT: beqz s0, .LBB10_4
; RV32IFZFHMIN-NEXT: # %bb.3:
-; RV32IFZFHMIN-NEXT: addi a2, a4, -1
+; RV32IFZFHMIN-NEXT: addi a3, a4, -1
; RV32IFZFHMIN-NEXT: .LBB10_4: # %start
-; RV32IFZFHMIN-NEXT: feq.s a1, fs0, fs0
-; RV32IFZFHMIN-NEXT: neg a4, a1
-; RV32IFZFHMIN-NEXT: and a1, a4, a2
-; RV32IFZFHMIN-NEXT: neg a2, a3
-; RV32IFZFHMIN-NEXT: neg a3, s0
-; RV32IFZFHMIN-NEXT: and a0, a3, a0
-; RV32IFZFHMIN-NEXT: or a0, a2, a0
-; RV32IFZFHMIN-NEXT: and a0, a4, a0
-; RV32IFZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: addi sp, sp, 16
+; RV32IFZFHMIN-NEXT: and a1, a2, a3
+; RV32IFZFHMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT: flw fs0, 8(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT: addi sp, sp, 32
; RV32IFZFHMIN-NEXT: ret
;
; CHECK64-IZFHMIN-LABEL: fcvt_l_h_sat:
@@ -2605,41 +2629,47 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
;
; RV32IDZFHMIN-LABEL: fcvt_l_h_sat:
; RV32IDZFHMIN: # %bb.0: # %start
-; RV32IDZFHMIN-NEXT: addi sp, sp, -16
-; RV32IDZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32IDZFHMIN-NEXT: addi sp, sp, -32
+; RV32IDZFHMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IDZFHMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IDZFHMIN-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IDZFHMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IDZFHMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32IDZFHMIN-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
+; RV32IDZFHMIN-NEXT: lui a0, %hi(.LCPI10_0)
+; RV32IDZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; RV32IDZFHMIN-NEXT: fcvt.s.h fs0, fa0
+; RV32IDZFHMIN-NEXT: flt.s s0, fa5, fs0
+; RV32IDZFHMIN-NEXT: neg s1, s0
; RV32IDZFHMIN-NEXT: lui a0, 913408
; RV32IDZFHMIN-NEXT: fmv.w.x fa5, a0
-; RV32IDZFHMIN-NEXT: fle.s s0, fa5, fs0
+; RV32IDZFHMIN-NEXT: fle.s s2, fa5, fs0
+; RV32IDZFHMIN-NEXT: neg s3, s2
; RV32IDZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IDZFHMIN-NEXT: call __fixsfdi
+; RV32IDZFHMIN-NEXT: and a0, s3, a0
+; RV32IDZFHMIN-NEXT: or a0, s1, a0
+; RV32IDZFHMIN-NEXT: feq.s a2, fs0, fs0
+; RV32IDZFHMIN-NEXT: neg a2, a2
; RV32IDZFHMIN-NEXT: lui a4, 524288
-; RV32IDZFHMIN-NEXT: lui a2, 524288
-; RV32IDZFHMIN-NEXT: beqz s0, .LBB10_2
+; RV32IDZFHMIN-NEXT: lui a3, 524288
+; RV32IDZFHMIN-NEXT: beqz s2, .LBB10_2
; RV32IDZFHMIN-NEXT: # %bb.1: # %start
-; RV32IDZFHMIN-NEXT: mv a2, a1
+; RV32IDZFHMIN-NEXT: mv a3, a1
; RV32IDZFHMIN-NEXT: .LBB10_2: # %start
-; RV32IDZFHMIN-NEXT: lui a1, %hi(.LCPI10_0)
-; RV32IDZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IDZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IDZFHMIN-NEXT: beqz a3, .LBB10_4
+; RV32IDZFHMIN-NEXT: and a0, a2, a0
+; RV32IDZFHMIN-NEXT: beqz s0, .LBB10_4
; RV32IDZFHMIN-NEXT: # %bb.3:
-; RV32IDZFHMIN-NEXT: addi a2, a4, -1
+; RV32IDZFHMIN-NEXT: addi a3, a4, -1
; RV32IDZFHMIN-NEXT: .LBB10_4: # %start
-; RV32IDZFHMIN-NEXT: feq.s a1, fs0, fs0
-; RV32IDZFHMIN-NEXT: neg a4, a1
-; RV32IDZFHMIN-NEXT: and a1, a4, a2
-; RV32IDZFHMIN-NEXT: neg a2, a3
-; RV32IDZFHMIN-NEXT: neg a3, s0
-; RV32IDZFHMIN-NEXT: and a0, a3, a0
-; RV32IDZFHMIN-NEXT: or a0, a2, a0
-; RV32IDZFHMIN-NEXT: and a0, a4, a0
-; RV32IDZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32IDZFHMIN-NEXT: and a1, a2, a3
+; RV32IDZFHMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IDZFHMIN-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IDZFHMIN-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IDZFHMIN-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IDZFHMIN-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32IDZFHMIN-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
-; RV32IDZFHMIN-NEXT: addi sp, sp, 16
+; RV32IDZFHMIN-NEXT: addi sp, sp, 32
; RV32IDZFHMIN-NEXT: ret
;
; CHECK32-IZHINXMIN-LABEL: fcvt_l_h_sat:
diff --git a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
index 04a8a66f44598f4..9c95210bfa7c01c 100644
--- a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
@@ -108,38 +108,40 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
+; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
+; RV32IZFH-NEXT: lui a2, %hi(.LCPI1_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI1_1)(a2)
+; RV32IZFH-NEXT: and a0, s1, a0
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: feq.s a2, fs0, fs0
+; RV32IZFH-NEXT: neg a2, a2
+; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
-; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB1_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a2, a1
+; RV32IZFH-NEXT: mv a4, a1
; RV32IZFH-NEXT: .LBB1_4:
-; RV32IZFH-NEXT: lui a1, %hi(.LCPI1_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI1_1)(a1)
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: beqz a3, .LBB1_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a2, a4, -1
-; RV32IZFH-NEXT: .LBB1_6:
-; RV32IZFH-NEXT: feq.s a1, fs0, fs0
-; RV32IZFH-NEXT: neg a4, a1
-; RV32IZFH-NEXT: and a1, a4, a2
-; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
+; RV32IZFH-NEXT: beqz a3, .LBB1_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a4, a5, -1
+; RV32IZFH-NEXT: .LBB1_6:
+; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_floor_si64:
@@ -177,16 +179,16 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI1_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI1_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a4, a2, s0
-; RV32IZHINX-NEXT: neg a2, a4
+; RV32IZHINX-NEXT: flt.s a3, a2, s0
+; RV32IZHINX-NEXT: neg a2, a3
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a3, 524288
+; RV32IZHINX-NEXT: lui a4, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB1_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a3, a1
+; RV32IZHINX-NEXT: mv a4, a1
; RV32IZHINX-NEXT: .LBB1_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -194,11 +196,11 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a4, .LBB1_6
+; RV32IZHINX-NEXT: beqz a3, .LBB1_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a3, a5, -1
+; RV32IZHINX-NEXT: addi a4, a5, -1
; RV32IZHINX-NEXT: .LBB1_6:
-; RV32IZHINX-NEXT: and a1, a2, a3
+; RV32IZHINX-NEXT: and a1, a2, a4
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_floor_si64:
@@ -236,39 +238,41 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
+; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
+; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI1_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI1_0)(a2)
+; RV32IZFHMIN-NEXT: and a0, s1, a0
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a2, a2
+; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
-; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB1_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a2, a1
+; RV32IZFHMIN-NEXT: mv a4, a1
; RV32IZFHMIN-NEXT: .LBB1_4:
-; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI1_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI1_0)(a1)
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: beqz a3, .LBB1_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a2, a4, -1
-; RV32IZFHMIN-NEXT: .LBB1_6:
-; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a4, a1
-; RV32IZFHMIN-NEXT: and a1, a4, a2
-; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
+; RV32IZFHMIN-NEXT: beqz a3, .LBB1_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a4, a5, -1
+; RV32IZFHMIN-NEXT: .LBB1_6:
+; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_floor_si64:
@@ -320,16 +324,16 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a4
+; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a3
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a3, 524288
+; RV32IZHINXMIN-NEXT: lui a4, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB1_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a3, a1
+; RV32IZHINXMIN-NEXT: mv a4, a1
; RV32IZHINXMIN-NEXT: .LBB1_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -337,11 +341,11 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a4, .LBB1_6
+; RV32IZHINXMIN-NEXT: beqz a3, .LBB1_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a3, a5, -1
+; RV32IZHINXMIN-NEXT: addi a4, a5, -1
; RV32IZHINXMIN-NEXT: .LBB1_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a3
+; RV32IZHINXMIN-NEXT: and a1, a2, a4
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_floor_si64:
@@ -820,38 +824,40 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
+; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
+; RV32IZFH-NEXT: lui a2, %hi(.LCPI5_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI5_1)(a2)
+; RV32IZFH-NEXT: and a0, s1, a0
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: feq.s a2, fs0, fs0
+; RV32IZFH-NEXT: neg a2, a2
+; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
-; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB5_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a2, a1
+; RV32IZFH-NEXT: mv a4, a1
; RV32IZFH-NEXT: .LBB5_4:
-; RV32IZFH-NEXT: lui a1, %hi(.LCPI5_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI5_1)(a1)
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: beqz a3, .LBB5_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a2, a4, -1
-; RV32IZFH-NEXT: .LBB5_6:
-; RV32IZFH-NEXT: feq.s a1, fs0, fs0
-; RV32IZFH-NEXT: neg a4, a1
-; RV32IZFH-NEXT: and a1, a4, a2
-; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
+; RV32IZFH-NEXT: beqz a3, .LBB5_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a4, a5, -1
+; RV32IZFH-NEXT: .LBB5_6:
+; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_ceil_si64:
@@ -889,16 +895,16 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI5_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI5_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a4, a2, s0
-; RV32IZHINX-NEXT: neg a2, a4
+; RV32IZHINX-NEXT: flt.s a3, a2, s0
+; RV32IZHINX-NEXT: neg a2, a3
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a3, 524288
+; RV32IZHINX-NEXT: lui a4, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB5_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a3, a1
+; RV32IZHINX-NEXT: mv a4, a1
; RV32IZHINX-NEXT: .LBB5_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -906,11 +912,11 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a4, .LBB5_6
+; RV32IZHINX-NEXT: beqz a3, .LBB5_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a3, a5, -1
+; RV32IZHINX-NEXT: addi a4, a5, -1
; RV32IZHINX-NEXT: .LBB5_6:
-; RV32IZHINX-NEXT: and a1, a2, a3
+; RV32IZHINX-NEXT: and a1, a2, a4
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_ceil_si64:
@@ -948,39 +954,41 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
+; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
+; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI5_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI5_0)(a2)
+; RV32IZFHMIN-NEXT: and a0, s1, a0
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a2, a2
+; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
-; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB5_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a2, a1
+; RV32IZFHMIN-NEXT: mv a4, a1
; RV32IZFHMIN-NEXT: .LBB5_4:
-; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI5_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI5_0)(a1)
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: beqz a3, .LBB5_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a2, a4, -1
-; RV32IZFHMIN-NEXT: .LBB5_6:
-; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a4, a1
-; RV32IZFHMIN-NEXT: and a1, a4, a2
-; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
+; RV32IZFHMIN-NEXT: beqz a3, .LBB5_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a4, a5, -1
+; RV32IZFHMIN-NEXT: .LBB5_6:
+; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_ceil_si64:
@@ -1032,16 +1040,16 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI5_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI5_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a4
+; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a3
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a3, 524288
+; RV32IZHINXMIN-NEXT: lui a4, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB5_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a3, a1
+; RV32IZHINXMIN-NEXT: mv a4, a1
; RV32IZHINXMIN-NEXT: .LBB5_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -1049,11 +1057,11 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a4, .LBB5_6
+; RV32IZHINXMIN-NEXT: beqz a3, .LBB5_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a3, a5, -1
+; RV32IZHINXMIN-NEXT: addi a4, a5, -1
; RV32IZHINXMIN-NEXT: .LBB5_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a3
+; RV32IZHINXMIN-NEXT: and a1, a2, a4
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_ceil_si64:
@@ -1532,38 +1540,40 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
+; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
+; RV32IZFH-NEXT: lui a2, %hi(.LCPI9_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI9_1)(a2)
+; RV32IZFH-NEXT: and a0, s1, a0
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: feq.s a2, fs0, fs0
+; RV32IZFH-NEXT: neg a2, a2
+; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
-; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB9_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a2, a1
+; RV32IZFH-NEXT: mv a4, a1
; RV32IZFH-NEXT: .LBB9_4:
-; RV32IZFH-NEXT: lui a1, %hi(.LCPI9_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI9_1)(a1)
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: beqz a3, .LBB9_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a2, a4, -1
-; RV32IZFH-NEXT: .LBB9_6:
-; RV32IZFH-NEXT: feq.s a1, fs0, fs0
-; RV32IZFH-NEXT: neg a4, a1
-; RV32IZFH-NEXT: and a1, a4, a2
-; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
+; RV32IZFH-NEXT: beqz a3, .LBB9_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a4, a5, -1
+; RV32IZFH-NEXT: .LBB9_6:
+; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_trunc_si64:
@@ -1601,16 +1611,16 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI9_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI9_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a4, a2, s0
-; RV32IZHINX-NEXT: neg a2, a4
+; RV32IZHINX-NEXT: flt.s a3, a2, s0
+; RV32IZHINX-NEXT: neg a2, a3
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a3, 524288
+; RV32IZHINX-NEXT: lui a4, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB9_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a3, a1
+; RV32IZHINX-NEXT: mv a4, a1
; RV32IZHINX-NEXT: .LBB9_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -1618,11 +1628,11 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a4, .LBB9_6
+; RV32IZHINX-NEXT: beqz a3, .LBB9_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a3, a5, -1
+; RV32IZHINX-NEXT: addi a4, a5, -1
; RV32IZHINX-NEXT: .LBB9_6:
-; RV32IZHINX-NEXT: and a1, a2, a3
+; RV32IZHINX-NEXT: and a1, a2, a4
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_trunc_si64:
@@ -1660,39 +1670,41 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
+; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
+; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI9_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI9_0)(a2)
+; RV32IZFHMIN-NEXT: and a0, s1, a0
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a2, a2
+; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
-; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB9_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a2, a1
+; RV32IZFHMIN-NEXT: mv a4, a1
; RV32IZFHMIN-NEXT: .LBB9_4:
-; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI9_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI9_0)(a1)
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: beqz a3, .LBB9_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a2, a4, -1
-; RV32IZFHMIN-NEXT: .LBB9_6:
-; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a4, a1
-; RV32IZFHMIN-NEXT: and a1, a4, a2
-; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
+; RV32IZFHMIN-NEXT: beqz a3, .LBB9_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a4, a5, -1
+; RV32IZFHMIN-NEXT: .LBB9_6:
+; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_trunc_si64:
@@ -1744,16 +1756,16 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI9_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI9_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a4
+; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a3
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a3, 524288
+; RV32IZHINXMIN-NEXT: lui a4, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB9_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a3, a1
+; RV32IZHINXMIN-NEXT: mv a4, a1
; RV32IZHINXMIN-NEXT: .LBB9_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -1761,11 +1773,11 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a4, .LBB9_6
+; RV32IZHINXMIN-NEXT: beqz a3, .LBB9_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a3, a5, -1
+; RV32IZHINXMIN-NEXT: addi a4, a5, -1
; RV32IZHINXMIN-NEXT: .LBB9_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a3
+; RV32IZHINXMIN-NEXT: and a1, a2, a4
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_trunc_si64:
@@ -2244,38 +2256,40 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
+; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
+; RV32IZFH-NEXT: lui a2, %hi(.LCPI13_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI13_1)(a2)
+; RV32IZFH-NEXT: and a0, s1, a0
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: feq.s a2, fs0, fs0
+; RV32IZFH-NEXT: neg a2, a2
+; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
-; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB13_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a2, a1
+; RV32IZFH-NEXT: mv a4, a1
; RV32IZFH-NEXT: .LBB13_4:
-; RV32IZFH-NEXT: lui a1, %hi(.LCPI13_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI13_1)(a1)
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: beqz a3, .LBB13_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a2, a4, -1
-; RV32IZFH-NEXT: .LBB13_6:
-; RV32IZFH-NEXT: feq.s a1, fs0, fs0
-; RV32IZFH-NEXT: neg a4, a1
-; RV32IZFH-NEXT: and a1, a4, a2
-; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
+; RV32IZFH-NEXT: beqz a3, .LBB13_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a4, a5, -1
+; RV32IZFH-NEXT: .LBB13_6:
+; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_round_si64:
@@ -2313,16 +2327,16 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI13_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI13_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a4, a2, s0
-; RV32IZHINX-NEXT: neg a2, a4
+; RV32IZHINX-NEXT: flt.s a3, a2, s0
+; RV32IZHINX-NEXT: neg a2, a3
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a3, 524288
+; RV32IZHINX-NEXT: lui a4, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB13_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a3, a1
+; RV32IZHINX-NEXT: mv a4, a1
; RV32IZHINX-NEXT: .LBB13_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -2330,11 +2344,11 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a4, .LBB13_6
+; RV32IZHINX-NEXT: beqz a3, .LBB13_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a3, a5, -1
+; RV32IZHINX-NEXT: addi a4, a5, -1
; RV32IZHINX-NEXT: .LBB13_6:
-; RV32IZHINX-NEXT: and a1, a2, a3
+; RV32IZHINX-NEXT: and a1, a2, a4
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_round_si64:
@@ -2372,39 +2386,41 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
+; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
+; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI13_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI13_0)(a2)
+; RV32IZFHMIN-NEXT: and a0, s1, a0
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a2, a2
+; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
-; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB13_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a2, a1
+; RV32IZFHMIN-NEXT: mv a4, a1
; RV32IZFHMIN-NEXT: .LBB13_4:
-; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI13_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI13_0)(a1)
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: beqz a3, .LBB13_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a2, a4, -1
-; RV32IZFHMIN-NEXT: .LBB13_6:
-; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a4, a1
-; RV32IZFHMIN-NEXT: and a1, a4, a2
-; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
+; RV32IZFHMIN-NEXT: beqz a3, .LBB13_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a4, a5, -1
+; RV32IZFHMIN-NEXT: .LBB13_6:
+; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_round_si64:
@@ -2456,16 +2472,16 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI13_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI13_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a4
+; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a3
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a3, 524288
+; RV32IZHINXMIN-NEXT: lui a4, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB13_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a3, a1
+; RV32IZHINXMIN-NEXT: mv a4, a1
; RV32IZHINXMIN-NEXT: .LBB13_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -2473,11 +2489,11 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a4, .LBB13_6
+; RV32IZHINXMIN-NEXT: beqz a3, .LBB13_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a3, a5, -1
+; RV32IZHINXMIN-NEXT: addi a4, a5, -1
; RV32IZHINXMIN-NEXT: .LBB13_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a3
+; RV32IZHINXMIN-NEXT: and a1, a2, a4
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_round_si64:
@@ -2956,38 +2972,40 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
+; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
+; RV32IZFH-NEXT: lui a2, %hi(.LCPI17_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI17_1)(a2)
+; RV32IZFH-NEXT: and a0, s1, a0
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: feq.s a2, fs0, fs0
+; RV32IZFH-NEXT: neg a2, a2
+; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
-; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB17_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a2, a1
+; RV32IZFH-NEXT: mv a4, a1
; RV32IZFH-NEXT: .LBB17_4:
-; RV32IZFH-NEXT: lui a1, %hi(.LCPI17_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI17_1)(a1)
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: beqz a3, .LBB17_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a2, a4, -1
-; RV32IZFH-NEXT: .LBB17_6:
-; RV32IZFH-NEXT: feq.s a1, fs0, fs0
-; RV32IZFH-NEXT: neg a4, a1
-; RV32IZFH-NEXT: and a1, a4, a2
-; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
+; RV32IZFH-NEXT: beqz a3, .LBB17_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a4, a5, -1
+; RV32IZFH-NEXT: .LBB17_6:
+; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_roundeven_si64:
@@ -3025,16 +3043,16 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI17_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI17_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a4, a2, s0
-; RV32IZHINX-NEXT: neg a2, a4
+; RV32IZHINX-NEXT: flt.s a3, a2, s0
+; RV32IZHINX-NEXT: neg a2, a3
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a3, 524288
+; RV32IZHINX-NEXT: lui a4, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB17_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a3, a1
+; RV32IZHINX-NEXT: mv a4, a1
; RV32IZHINX-NEXT: .LBB17_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -3042,11 +3060,11 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a4, .LBB17_6
+; RV32IZHINX-NEXT: beqz a3, .LBB17_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a3, a5, -1
+; RV32IZHINX-NEXT: addi a4, a5, -1
; RV32IZHINX-NEXT: .LBB17_6:
-; RV32IZHINX-NEXT: and a1, a2, a3
+; RV32IZHINX-NEXT: and a1, a2, a4
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_roundeven_si64:
@@ -3084,39 +3102,41 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
+; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
+; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI17_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI17_0)(a2)
+; RV32IZFHMIN-NEXT: and a0, s1, a0
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a2, a2
+; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
-; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB17_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a2, a1
+; RV32IZFHMIN-NEXT: mv a4, a1
; RV32IZFHMIN-NEXT: .LBB17_4:
-; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI17_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI17_0)(a1)
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: beqz a3, .LBB17_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a2, a4, -1
-; RV32IZFHMIN-NEXT: .LBB17_6:
-; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a4, a1
-; RV32IZFHMIN-NEXT: and a1, a4, a2
-; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
+; RV32IZFHMIN-NEXT: beqz a3, .LBB17_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a4, a5, -1
+; RV32IZFHMIN-NEXT: .LBB17_6:
+; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_roundeven_si64:
@@ -3168,16 +3188,16 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI17_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI17_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a4
+; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a3
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a3, 524288
+; RV32IZHINXMIN-NEXT: lui a4, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB17_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a3, a1
+; RV32IZHINXMIN-NEXT: mv a4, a1
; RV32IZHINXMIN-NEXT: .LBB17_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -3185,11 +3205,11 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a4, .LBB17_6
+; RV32IZHINXMIN-NEXT: beqz a3, .LBB17_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a3, a5, -1
+; RV32IZHINXMIN-NEXT: addi a4, a5, -1
; RV32IZHINXMIN-NEXT: .LBB17_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a3
+; RV32IZHINXMIN-NEXT: and a1, a2, a4
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_roundeven_si64:
@@ -3668,38 +3688,40 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
+; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
+; RV32IZFH-NEXT: lui a2, %hi(.LCPI21_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI21_1)(a2)
+; RV32IZFH-NEXT: and a0, s1, a0
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: feq.s a2, fs0, fs0
+; RV32IZFH-NEXT: neg a2, a2
+; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
-; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB21_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a2, a1
+; RV32IZFH-NEXT: mv a4, a1
; RV32IZFH-NEXT: .LBB21_4:
-; RV32IZFH-NEXT: lui a1, %hi(.LCPI21_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI21_1)(a1)
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: beqz a3, .LBB21_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a2, a4, -1
-; RV32IZFH-NEXT: .LBB21_6:
-; RV32IZFH-NEXT: feq.s a1, fs0, fs0
-; RV32IZFH-NEXT: neg a4, a1
-; RV32IZFH-NEXT: and a1, a4, a2
-; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
+; RV32IZFH-NEXT: beqz a3, .LBB21_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a4, a5, -1
+; RV32IZFH-NEXT: .LBB21_6:
+; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_rint_si64:
@@ -3737,16 +3759,16 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI21_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI21_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a4, a2, s0
-; RV32IZHINX-NEXT: neg a2, a4
+; RV32IZHINX-NEXT: flt.s a3, a2, s0
+; RV32IZHINX-NEXT: neg a2, a3
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a3, 524288
+; RV32IZHINX-NEXT: lui a4, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB21_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a3, a1
+; RV32IZHINX-NEXT: mv a4, a1
; RV32IZHINX-NEXT: .LBB21_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -3754,11 +3776,11 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a4, .LBB21_6
+; RV32IZHINX-NEXT: beqz a3, .LBB21_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a3, a5, -1
+; RV32IZHINX-NEXT: addi a4, a5, -1
; RV32IZHINX-NEXT: .LBB21_6:
-; RV32IZHINX-NEXT: and a1, a2, a3
+; RV32IZHINX-NEXT: and a1, a2, a4
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_rint_si64:
@@ -3796,39 +3818,41 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
+; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
+; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI21_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI21_0)(a2)
+; RV32IZFHMIN-NEXT: and a0, s1, a0
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a2, a2
+; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
-; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB21_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a2, a1
+; RV32IZFHMIN-NEXT: mv a4, a1
; RV32IZFHMIN-NEXT: .LBB21_4:
-; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI21_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI21_0)(a1)
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: beqz a3, .LBB21_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a2, a4, -1
-; RV32IZFHMIN-NEXT: .LBB21_6:
-; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a4, a1
-; RV32IZFHMIN-NEXT: and a1, a4, a2
-; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
+; RV32IZFHMIN-NEXT: beqz a3, .LBB21_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a4, a5, -1
+; RV32IZFHMIN-NEXT: .LBB21_6:
+; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_rint_si64:
@@ -3880,16 +3904,16 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI21_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI21_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a4
+; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a3
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a3, 524288
+; RV32IZHINXMIN-NEXT: lui a4, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB21_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a3, a1
+; RV32IZHINXMIN-NEXT: mv a4, a1
; RV32IZHINXMIN-NEXT: .LBB21_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -3897,11 +3921,11 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a4, .LBB21_6
+; RV32IZHINXMIN-NEXT: beqz a3, .LBB21_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a3, a5, -1
+; RV32IZHINXMIN-NEXT: addi a4, a5, -1
; RV32IZHINXMIN-NEXT: .LBB21_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a3
+; RV32IZHINXMIN-NEXT: and a1, a2, a4
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_rint_si64:
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index a0c85ab4dca7f7c..98c886333d69a03 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -630,8 +630,8 @@ define void @zext16_abs8(i8 %x, ptr %p) {
; RV32I-LABEL: zext16_abs8:
; RV32I: # %bb.0:
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: srai a2, a0, 31
; RV32I-NEXT: srai a0, a0, 24
+; RV32I-NEXT: srai a2, a0, 31
; RV32I-NEXT: xor a0, a0, a2
; RV32I-NEXT: sub a0, a0, a2
; RV32I-NEXT: sh a0, 0(a1)
@@ -648,8 +648,8 @@ define void @zext16_abs8(i8 %x, ptr %p) {
; RV64I-LABEL: zext16_abs8:
; RV64I: # %bb.0:
; RV64I-NEXT: slli a0, a0, 56
-; RV64I-NEXT: srai a2, a0, 63
; RV64I-NEXT: srai a0, a0, 56
+; RV64I-NEXT: srai a2, a0, 63
; RV64I-NEXT: xor a0, a0, a2
; RV64I-NEXT: subw a0, a0, a2
; RV64I-NEXT: sh a0, 0(a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index a6b2d3141f22f95..b3bda5973eb8c4c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -2190,66 +2190,65 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset fs0, -32
-; CHECK-NOV-NEXT: fmv.d fs0, fa1
+; CHECK-NOV-NEXT: fmv.d fs0, fa0
+; CHECK-NOV-NEXT: fmv.d fa0, fa1
; CHECK-NOV-NEXT: call __fixdfti
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.d fa0, fs0
; CHECK-NOV-NEXT: call __fixdfti
-; CHECK-NOV-NEXT: mv a2, a0
-; CHECK-NOV-NEXT: li a0, -1
-; CHECK-NOV-NEXT: srli a3, a0, 1
-; CHECK-NOV-NEXT: beqz a1, .LBB18_3
+; CHECK-NOV-NEXT: li a2, -1
+; CHECK-NOV-NEXT: srli a3, a2, 1
+; CHECK-NOV-NEXT: beqz s1, .LBB18_3
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: slti a4, a1, 0
-; CHECK-NOV-NEXT: bnez s1, .LBB18_4
+; CHECK-NOV-NEXT: slti a4, s1, 0
+; CHECK-NOV-NEXT: bnez a1, .LBB18_4
; CHECK-NOV-NEXT: .LBB18_2:
-; CHECK-NOV-NEXT: sltu a5, s0, a3
+; CHECK-NOV-NEXT: sltu a5, a0, a3
; CHECK-NOV-NEXT: beqz a5, .LBB18_5
; CHECK-NOV-NEXT: j .LBB18_6
; CHECK-NOV-NEXT: .LBB18_3:
-; CHECK-NOV-NEXT: sltu a4, a2, a3
-; CHECK-NOV-NEXT: beqz s1, .LBB18_2
+; CHECK-NOV-NEXT: sltu a4, s0, a3
+; CHECK-NOV-NEXT: beqz a1, .LBB18_2
; CHECK-NOV-NEXT: .LBB18_4: # %entry
-; CHECK-NOV-NEXT: slti a5, s1, 0
+; CHECK-NOV-NEXT: slti a5, a1, 0
; CHECK-NOV-NEXT: bnez a5, .LBB18_6
; CHECK-NOV-NEXT: .LBB18_5: # %entry
-; CHECK-NOV-NEXT: mv s0, a3
+; CHECK-NOV-NEXT: mv a0, a3
; CHECK-NOV-NEXT: .LBB18_6: # %entry
; CHECK-NOV-NEXT: neg a6, a5
; CHECK-NOV-NEXT: neg a5, a4
-; CHECK-NOV-NEXT: and a5, a5, a1
+; CHECK-NOV-NEXT: and a5, a5, s1
; CHECK-NOV-NEXT: bnez a4, .LBB18_8
; CHECK-NOV-NEXT: # %bb.7: # %entry
-; CHECK-NOV-NEXT: mv a2, a3
+; CHECK-NOV-NEXT: mv s0, a3
; CHECK-NOV-NEXT: .LBB18_8: # %entry
-; CHECK-NOV-NEXT: and a4, a6, s1
-; CHECK-NOV-NEXT: slli a1, a0, 63
-; CHECK-NOV-NEXT: beq a5, a0, .LBB18_11
+; CHECK-NOV-NEXT: and a4, a6, a1
+; CHECK-NOV-NEXT: slli a1, a2, 63
+; CHECK-NOV-NEXT: beq a5, a2, .LBB18_11
; CHECK-NOV-NEXT: # %bb.9: # %entry
; CHECK-NOV-NEXT: slti a3, a5, 0
; CHECK-NOV-NEXT: xori a3, a3, 1
-; CHECK-NOV-NEXT: bne a4, a0, .LBB18_12
+; CHECK-NOV-NEXT: bne a4, a2, .LBB18_12
; CHECK-NOV-NEXT: .LBB18_10:
-; CHECK-NOV-NEXT: sltu a0, a1, s0
-; CHECK-NOV-NEXT: beqz a0, .LBB18_13
+; CHECK-NOV-NEXT: sltu a2, a1, a0
+; CHECK-NOV-NEXT: beqz a2, .LBB18_13
; CHECK-NOV-NEXT: j .LBB18_14
; CHECK-NOV-NEXT: .LBB18_11:
-; CHECK-NOV-NEXT: sltu a3, a1, a2
-; CHECK-NOV-NEXT: beq a4, a0, .LBB18_10
+; CHECK-NOV-NEXT: sltu a3, a1, s0
+; CHECK-NOV-NEXT: beq a4, a2, .LBB18_10
; CHECK-NOV-NEXT: .LBB18_12: # %entry
-; CHECK-NOV-NEXT: slti a0, a4, 0
-; CHECK-NOV-NEXT: xori a0, a0, 1
-; CHECK-NOV-NEXT: bnez a0, .LBB18_14
+; CHECK-NOV-NEXT: slti a2, a4, 0
+; CHECK-NOV-NEXT: xori a2, a2, 1
+; CHECK-NOV-NEXT: bnez a2, .LBB18_14
; CHECK-NOV-NEXT: .LBB18_13: # %entry
-; CHECK-NOV-NEXT: mv s0, a1
+; CHECK-NOV-NEXT: mv a0, a1
; CHECK-NOV-NEXT: .LBB18_14: # %entry
; CHECK-NOV-NEXT: bnez a3, .LBB18_16
; CHECK-NOV-NEXT: # %bb.15: # %entry
-; CHECK-NOV-NEXT: mv a2, a1
+; CHECK-NOV-NEXT: mv s0, a1
; CHECK-NOV-NEXT: .LBB18_16: # %entry
-; CHECK-NOV-NEXT: mv a0, s0
-; CHECK-NOV-NEXT: mv a1, a2
+; CHECK-NOV-NEXT: mv a1, s0
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -2274,43 +2273,43 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-V-NEXT: vfmv.f.s fa0, v9
+; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixdfti
; CHECK-V-NEXT: mv s0, a0
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixdfti
; CHECK-V-NEXT: li a2, -1
; CHECK-V-NEXT: srli a3, a2, 1
-; CHECK-V-NEXT: beqz a1, .LBB18_3
+; CHECK-V-NEXT: beqz s1, .LBB18_3
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: slti a4, a1, 0
-; CHECK-V-NEXT: bnez s1, .LBB18_4
+; CHECK-V-NEXT: slti a4, s1, 0
+; CHECK-V-NEXT: bnez a1, .LBB18_4
; CHECK-V-NEXT: .LBB18_2:
-; CHECK-V-NEXT: sltu a5, s0, a3
+; CHECK-V-NEXT: sltu a5, a0, a3
; CHECK-V-NEXT: beqz a5, .LBB18_5
; CHECK-V-NEXT: j .LBB18_6
; CHECK-V-NEXT: .LBB18_3:
-; CHECK-V-NEXT: sltu a4, a0, a3
-; CHECK-V-NEXT: beqz s1, .LBB18_2
+; CHECK-V-NEXT: sltu a4, s0, a3
+; CHECK-V-NEXT: beqz a1, .LBB18_2
; CHECK-V-NEXT: .LBB18_4: # %entry
-; CHECK-V-NEXT: slti a5, s1, 0
+; CHECK-V-NEXT: slti a5, a1, 0
; CHECK-V-NEXT: bnez a5, .LBB18_6
; CHECK-V-NEXT: .LBB18_5: # %entry
-; CHECK-V-NEXT: mv s0, a3
+; CHECK-V-NEXT: mv a0, a3
; CHECK-V-NEXT: .LBB18_6: # %entry
; CHECK-V-NEXT: neg a6, a5
; CHECK-V-NEXT: neg a5, a4
-; CHECK-V-NEXT: and a5, a5, a1
+; CHECK-V-NEXT: and a5, a5, s1
; CHECK-V-NEXT: bnez a4, .LBB18_8
; CHECK-V-NEXT: # %bb.7: # %entry
-; CHECK-V-NEXT: mv a0, a3
+; CHECK-V-NEXT: mv s0, a3
; CHECK-V-NEXT: .LBB18_8: # %entry
-; CHECK-V-NEXT: and a4, a6, s1
+; CHECK-V-NEXT: and a4, a6, a1
; CHECK-V-NEXT: slli a1, a2, 63
; CHECK-V-NEXT: beq a5, a2, .LBB18_11
; CHECK-V-NEXT: # %bb.9: # %entry
@@ -2318,26 +2317,26 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
; CHECK-V-NEXT: xori a3, a3, 1
; CHECK-V-NEXT: bne a4, a2, .LBB18_12
; CHECK-V-NEXT: .LBB18_10:
-; CHECK-V-NEXT: sltu a2, a1, s0
+; CHECK-V-NEXT: sltu a2, a1, a0
; CHECK-V-NEXT: beqz a2, .LBB18_13
; CHECK-V-NEXT: j .LBB18_14
; CHECK-V-NEXT: .LBB18_11:
-; CHECK-V-NEXT: sltu a3, a1, a0
+; CHECK-V-NEXT: sltu a3, a1, s0
; CHECK-V-NEXT: beq a4, a2, .LBB18_10
; CHECK-V-NEXT: .LBB18_12: # %entry
; CHECK-V-NEXT: slti a2, a4, 0
; CHECK-V-NEXT: xori a2, a2, 1
; CHECK-V-NEXT: bnez a2, .LBB18_14
; CHECK-V-NEXT: .LBB18_13: # %entry
-; CHECK-V-NEXT: mv s0, a1
+; CHECK-V-NEXT: mv a0, a1
; CHECK-V-NEXT: .LBB18_14: # %entry
; CHECK-V-NEXT: bnez a3, .LBB18_16
; CHECK-V-NEXT: # %bb.15: # %entry
-; CHECK-V-NEXT: mv a0, a1
+; CHECK-V-NEXT: mv s0, a1
; CHECK-V-NEXT: .LBB18_16: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: vmv.s.x v9, s0
+; CHECK-V-NEXT: vmv.s.x v8, s0
+; CHECK-V-NEXT: vmv.s.x v9, a0
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
@@ -2370,19 +2369,19 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset fs0, -32
-; CHECK-NOV-NEXT: fmv.d fs0, fa1
+; CHECK-NOV-NEXT: fmv.d fs0, fa0
+; CHECK-NOV-NEXT: fmv.d fa0, fa1
; CHECK-NOV-NEXT: call __fixunsdfti
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.d fa0, fs0
; CHECK-NOV-NEXT: call __fixunsdfti
-; CHECK-NOV-NEXT: snez a1, a1
; CHECK-NOV-NEXT: snez a2, s1
-; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: and a2, a2, s0
+; CHECK-NOV-NEXT: snez a1, a1
; CHECK-NOV-NEXT: addi a1, a1, -1
-; CHECK-NOV-NEXT: and a1, a1, a0
-; CHECK-NOV-NEXT: mv a0, a2
+; CHECK-NOV-NEXT: and a0, a1, a0
+; CHECK-NOV-NEXT: addi a1, a2, -1
+; CHECK-NOV-NEXT: and a1, a1, s0
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -2407,25 +2406,25 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-V-NEXT: vfmv.f.s fa0, v9
+; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixunsdfti
; CHECK-V-NEXT: mv s0, a0
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixunsdfti
-; CHECK-V-NEXT: snez a1, a1
; CHECK-V-NEXT: snez a2, s1
-; CHECK-V-NEXT: addi a2, a2, -1
-; CHECK-V-NEXT: and a2, a2, s0
+; CHECK-V-NEXT: snez a1, a1
; CHECK-V-NEXT: addi a1, a1, -1
; CHECK-V-NEXT: and a0, a1, a0
+; CHECK-V-NEXT: addi a2, a2, -1
+; CHECK-V-NEXT: and a2, a2, s0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: vmv.s.x v9, a2
+; CHECK-V-NEXT: vmv.s.x v8, a2
+; CHECK-V-NEXT: vmv.s.x v9, a0
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
@@ -2467,32 +2466,32 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) {
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB20_2: # %entry
-; CHECK-NOV-NEXT: slti a3, a1, 1
; CHECK-NOV-NEXT: slti a4, s1, 1
+; CHECK-NOV-NEXT: slti a3, a1, 1
; CHECK-NOV-NEXT: blez a1, .LBB20_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
; CHECK-NOV-NEXT: li a1, 1
; CHECK-NOV-NEXT: .LBB20_4: # %entry
-; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, a0
+; CHECK-NOV-NEXT: neg a0, a4
; CHECK-NOV-NEXT: beqz a1, .LBB20_7
; CHECK-NOV-NEXT: # %bb.5: # %entry
; CHECK-NOV-NEXT: sgtz a1, a1
-; CHECK-NOV-NEXT: and a4, a4, s0
+; CHECK-NOV-NEXT: and a0, a0, s0
; CHECK-NOV-NEXT: bnez a2, .LBB20_8
; CHECK-NOV-NEXT: .LBB20_6:
-; CHECK-NOV-NEXT: snez a0, a4
+; CHECK-NOV-NEXT: snez a2, a0
; CHECK-NOV-NEXT: j .LBB20_9
; CHECK-NOV-NEXT: .LBB20_7:
; CHECK-NOV-NEXT: snez a1, a3
-; CHECK-NOV-NEXT: and a4, a4, s0
+; CHECK-NOV-NEXT: and a0, a0, s0
; CHECK-NOV-NEXT: beqz a2, .LBB20_6
; CHECK-NOV-NEXT: .LBB20_8: # %entry
-; CHECK-NOV-NEXT: sgtz a0, a2
+; CHECK-NOV-NEXT: sgtz a2, a2
; CHECK-NOV-NEXT: .LBB20_9: # %entry
-; CHECK-NOV-NEXT: neg a0, a0
-; CHECK-NOV-NEXT: and a0, a0, a4
+; CHECK-NOV-NEXT: neg a2, a2
+; CHECK-NOV-NEXT: and a0, a2, a0
; CHECK-NOV-NEXT: neg a1, a1
; CHECK-NOV-NEXT: and a1, a1, a3
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
@@ -2534,15 +2533,15 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) {
; CHECK-V-NEXT: # %bb.1: # %entry
; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB20_2: # %entry
-; CHECK-V-NEXT: slti a4, a1, 1
; CHECK-V-NEXT: slti a3, s1, 1
+; CHECK-V-NEXT: slti a4, a1, 1
; CHECK-V-NEXT: blez a1, .LBB20_4
; CHECK-V-NEXT: # %bb.3: # %entry
; CHECK-V-NEXT: li a1, 1
; CHECK-V-NEXT: .LBB20_4: # %entry
-; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: neg a4, a4
; CHECK-V-NEXT: and a0, a4, a0
+; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: beqz a1, .LBB20_7
; CHECK-V-NEXT: # %bb.5: # %entry
; CHECK-V-NEXT: sgtz a1, a1
@@ -2597,66 +2596,65 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset fs0, -32
-; CHECK-NOV-NEXT: fmv.s fs0, fa1
+; CHECK-NOV-NEXT: fmv.s fs0, fa0
+; CHECK-NOV-NEXT: fmv.s fa0, fa1
; CHECK-NOV-NEXT: call __fixsfti
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.s fa0, fs0
; CHECK-NOV-NEXT: call __fixsfti
-; CHECK-NOV-NEXT: mv a2, a0
-; CHECK-NOV-NEXT: li a0, -1
-; CHECK-NOV-NEXT: srli a3, a0, 1
-; CHECK-NOV-NEXT: beqz a1, .LBB21_3
+; CHECK-NOV-NEXT: li a2, -1
+; CHECK-NOV-NEXT: srli a3, a2, 1
+; CHECK-NOV-NEXT: beqz s1, .LBB21_3
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: slti a4, a1, 0
-; CHECK-NOV-NEXT: bnez s1, .LBB21_4
+; CHECK-NOV-NEXT: slti a4, s1, 0
+; CHECK-NOV-NEXT: bnez a1, .LBB21_4
; CHECK-NOV-NEXT: .LBB21_2:
-; CHECK-NOV-NEXT: sltu a5, s0, a3
+; CHECK-NOV-NEXT: sltu a5, a0, a3
; CHECK-NOV-NEXT: beqz a5, .LBB21_5
; CHECK-NOV-NEXT: j .LBB21_6
; CHECK-NOV-NEXT: .LBB21_3:
-; CHECK-NOV-NEXT: sltu a4, a2, a3
-; CHECK-NOV-NEXT: beqz s1, .LBB21_2
+; CHECK-NOV-NEXT: sltu a4, s0, a3
+; CHECK-NOV-NEXT: beqz a1, .LBB21_2
; CHECK-NOV-NEXT: .LBB21_4: # %entry
-; CHECK-NOV-NEXT: slti a5, s1, 0
+; CHECK-NOV-NEXT: slti a5, a1, 0
; CHECK-NOV-NEXT: bnez a5, .LBB21_6
; CHECK-NOV-NEXT: .LBB21_5: # %entry
-; CHECK-NOV-NEXT: mv s0, a3
+; CHECK-NOV-NEXT: mv a0, a3
; CHECK-NOV-NEXT: .LBB21_6: # %entry
; CHECK-NOV-NEXT: neg a6, a5
; CHECK-NOV-NEXT: neg a5, a4
-; CHECK-NOV-NEXT: and a5, a5, a1
+; CHECK-NOV-NEXT: and a5, a5, s1
; CHECK-NOV-NEXT: bnez a4, .LBB21_8
; CHECK-NOV-NEXT: # %bb.7: # %entry
-; CHECK-NOV-NEXT: mv a2, a3
+; CHECK-NOV-NEXT: mv s0, a3
; CHECK-NOV-NEXT: .LBB21_8: # %entry
-; CHECK-NOV-NEXT: and a4, a6, s1
-; CHECK-NOV-NEXT: slli a1, a0, 63
-; CHECK-NOV-NEXT: beq a5, a0, .LBB21_11
+; CHECK-NOV-NEXT: and a4, a6, a1
+; CHECK-NOV-NEXT: slli a1, a2, 63
+; CHECK-NOV-NEXT: beq a5, a2, .LBB21_11
; CHECK-NOV-NEXT: # %bb.9: # %entry
; CHECK-NOV-NEXT: slti a3, a5, 0
; CHECK-NOV-NEXT: xori a3, a3, 1
-; CHECK-NOV-NEXT: bne a4, a0, .LBB21_12
+; CHECK-NOV-NEXT: bne a4, a2, .LBB21_12
; CHECK-NOV-NEXT: .LBB21_10:
-; CHECK-NOV-NEXT: sltu a0, a1, s0
-; CHECK-NOV-NEXT: beqz a0, .LBB21_13
+; CHECK-NOV-NEXT: sltu a2, a1, a0
+; CHECK-NOV-NEXT: beqz a2, .LBB21_13
; CHECK-NOV-NEXT: j .LBB21_14
; CHECK-NOV-NEXT: .LBB21_11:
-; CHECK-NOV-NEXT: sltu a3, a1, a2
-; CHECK-NOV-NEXT: beq a4, a0, .LBB21_10
+; CHECK-NOV-NEXT: sltu a3, a1, s0
+; CHECK-NOV-NEXT: beq a4, a2, .LBB21_10
; CHECK-NOV-NEXT: .LBB21_12: # %entry
-; CHECK-NOV-NEXT: slti a0, a4, 0
-; CHECK-NOV-NEXT: xori a0, a0, 1
-; CHECK-NOV-NEXT: bnez a0, .LBB21_14
+; CHECK-NOV-NEXT: slti a2, a4, 0
+; CHECK-NOV-NEXT: xori a2, a2, 1
+; CHECK-NOV-NEXT: bnez a2, .LBB21_14
; CHECK-NOV-NEXT: .LBB21_13: # %entry
-; CHECK-NOV-NEXT: mv s0, a1
+; CHECK-NOV-NEXT: mv a0, a1
; CHECK-NOV-NEXT: .LBB21_14: # %entry
; CHECK-NOV-NEXT: bnez a3, .LBB21_16
; CHECK-NOV-NEXT: # %bb.15: # %entry
-; CHECK-NOV-NEXT: mv a2, a1
+; CHECK-NOV-NEXT: mv s0, a1
; CHECK-NOV-NEXT: .LBB21_16: # %entry
-; CHECK-NOV-NEXT: mv a0, s0
-; CHECK-NOV-NEXT: mv a1, a2
+; CHECK-NOV-NEXT: mv a1, s0
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -2681,43 +2679,43 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-V-NEXT: vfmv.f.s fa0, v9
+; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: mv s0, a0
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: li a2, -1
; CHECK-V-NEXT: srli a3, a2, 1
-; CHECK-V-NEXT: beqz a1, .LBB21_3
+; CHECK-V-NEXT: beqz s1, .LBB21_3
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: slti a4, a1, 0
-; CHECK-V-NEXT: bnez s1, .LBB21_4
+; CHECK-V-NEXT: slti a4, s1, 0
+; CHECK-V-NEXT: bnez a1, .LBB21_4
; CHECK-V-NEXT: .LBB21_2:
-; CHECK-V-NEXT: sltu a5, s0, a3
+; CHECK-V-NEXT: sltu a5, a0, a3
; CHECK-V-NEXT: beqz a5, .LBB21_5
; CHECK-V-NEXT: j .LBB21_6
; CHECK-V-NEXT: .LBB21_3:
-; CHECK-V-NEXT: sltu a4, a0, a3
-; CHECK-V-NEXT: beqz s1, .LBB21_2
+; CHECK-V-NEXT: sltu a4, s0, a3
+; CHECK-V-NEXT: beqz a1, .LBB21_2
; CHECK-V-NEXT: .LBB21_4: # %entry
-; CHECK-V-NEXT: slti a5, s1, 0
+; CHECK-V-NEXT: slti a5, a1, 0
; CHECK-V-NEXT: bnez a5, .LBB21_6
; CHECK-V-NEXT: .LBB21_5: # %entry
-; CHECK-V-NEXT: mv s0, a3
+; CHECK-V-NEXT: mv a0, a3
; CHECK-V-NEXT: .LBB21_6: # %entry
; CHECK-V-NEXT: neg a6, a5
; CHECK-V-NEXT: neg a5, a4
-; CHECK-V-NEXT: and a5, a5, a1
+; CHECK-V-NEXT: and a5, a5, s1
; CHECK-V-NEXT: bnez a4, .LBB21_8
; CHECK-V-NEXT: # %bb.7: # %entry
-; CHECK-V-NEXT: mv a0, a3
+; CHECK-V-NEXT: mv s0, a3
; CHECK-V-NEXT: .LBB21_8: # %entry
-; CHECK-V-NEXT: and a4, a6, s1
+; CHECK-V-NEXT: and a4, a6, a1
; CHECK-V-NEXT: slli a1, a2, 63
; CHECK-V-NEXT: beq a5, a2, .LBB21_11
; CHECK-V-NEXT: # %bb.9: # %entry
@@ -2725,26 +2723,26 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: xori a3, a3, 1
; CHECK-V-NEXT: bne a4, a2, .LBB21_12
; CHECK-V-NEXT: .LBB21_10:
-; CHECK-V-NEXT: sltu a2, a1, s0
+; CHECK-V-NEXT: sltu a2, a1, a0
; CHECK-V-NEXT: beqz a2, .LBB21_13
; CHECK-V-NEXT: j .LBB21_14
; CHECK-V-NEXT: .LBB21_11:
-; CHECK-V-NEXT: sltu a3, a1, a0
+; CHECK-V-NEXT: sltu a3, a1, s0
; CHECK-V-NEXT: beq a4, a2, .LBB21_10
; CHECK-V-NEXT: .LBB21_12: # %entry
; CHECK-V-NEXT: slti a2, a4, 0
; CHECK-V-NEXT: xori a2, a2, 1
; CHECK-V-NEXT: bnez a2, .LBB21_14
; CHECK-V-NEXT: .LBB21_13: # %entry
-; CHECK-V-NEXT: mv s0, a1
+; CHECK-V-NEXT: mv a0, a1
; CHECK-V-NEXT: .LBB21_14: # %entry
; CHECK-V-NEXT: bnez a3, .LBB21_16
; CHECK-V-NEXT: # %bb.15: # %entry
-; CHECK-V-NEXT: mv a0, a1
+; CHECK-V-NEXT: mv s0, a1
; CHECK-V-NEXT: .LBB21_16: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: vmv.s.x v9, s0
+; CHECK-V-NEXT: vmv.s.x v8, s0
+; CHECK-V-NEXT: vmv.s.x v9, a0
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
@@ -2777,19 +2775,19 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset fs0, -32
-; CHECK-NOV-NEXT: fmv.s fs0, fa1
+; CHECK-NOV-NEXT: fmv.s fs0, fa0
+; CHECK-NOV-NEXT: fmv.s fa0, fa1
; CHECK-NOV-NEXT: call __fixunssfti
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.s fa0, fs0
; CHECK-NOV-NEXT: call __fixunssfti
-; CHECK-NOV-NEXT: snez a1, a1
; CHECK-NOV-NEXT: snez a2, s1
-; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: and a2, a2, s0
+; CHECK-NOV-NEXT: snez a1, a1
; CHECK-NOV-NEXT: addi a1, a1, -1
-; CHECK-NOV-NEXT: and a1, a1, a0
-; CHECK-NOV-NEXT: mv a0, a2
+; CHECK-NOV-NEXT: and a0, a1, a0
+; CHECK-NOV-NEXT: addi a1, a2, -1
+; CHECK-NOV-NEXT: and a1, a1, s0
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -2814,25 +2812,25 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v9, v8, 1
-; CHECK-V-NEXT: vfmv.f.s fa0, v9
+; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixunssfti
; CHECK-V-NEXT: mv s0, a0
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixunssfti
-; CHECK-V-NEXT: snez a1, a1
; CHECK-V-NEXT: snez a2, s1
-; CHECK-V-NEXT: addi a2, a2, -1
-; CHECK-V-NEXT: and a2, a2, s0
+; CHECK-V-NEXT: snez a1, a1
; CHECK-V-NEXT: addi a1, a1, -1
; CHECK-V-NEXT: and a0, a1, a0
+; CHECK-V-NEXT: addi a2, a2, -1
+; CHECK-V-NEXT: and a2, a2, s0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v8, a0
-; CHECK-V-NEXT: vmv.s.x v9, a2
+; CHECK-V-NEXT: vmv.s.x v8, a2
+; CHECK-V-NEXT: vmv.s.x v9, a0
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
@@ -2874,32 +2872,32 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB23_2: # %entry
-; CHECK-NOV-NEXT: slti a3, a1, 1
; CHECK-NOV-NEXT: slti a4, s1, 1
+; CHECK-NOV-NEXT: slti a3, a1, 1
; CHECK-NOV-NEXT: blez a1, .LBB23_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
; CHECK-NOV-NEXT: li a1, 1
; CHECK-NOV-NEXT: .LBB23_4: # %entry
-; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, a0
+; CHECK-NOV-NEXT: neg a0, a4
; CHECK-NOV-NEXT: beqz a1, .LBB23_7
; CHECK-NOV-NEXT: # %bb.5: # %entry
; CHECK-NOV-NEXT: sgtz a1, a1
-; CHECK-NOV-NEXT: and a4, a4, s0
+; CHECK-NOV-NEXT: and a0, a0, s0
; CHECK-NOV-NEXT: bnez a2, .LBB23_8
; CHECK-NOV-NEXT: .LBB23_6:
-; CHECK-NOV-NEXT: snez a0, a4
+; CHECK-NOV-NEXT: snez a2, a0
; CHECK-NOV-NEXT: j .LBB23_9
; CHECK-NOV-NEXT: .LBB23_7:
; CHECK-NOV-NEXT: snez a1, a3
-; CHECK-NOV-NEXT: and a4, a4, s0
+; CHECK-NOV-NEXT: and a0, a0, s0
; CHECK-NOV-NEXT: beqz a2, .LBB23_6
; CHECK-NOV-NEXT: .LBB23_8: # %entry
-; CHECK-NOV-NEXT: sgtz a0, a2
+; CHECK-NOV-NEXT: sgtz a2, a2
; CHECK-NOV-NEXT: .LBB23_9: # %entry
-; CHECK-NOV-NEXT: neg a0, a0
-; CHECK-NOV-NEXT: and a0, a0, a4
+; CHECK-NOV-NEXT: neg a2, a2
+; CHECK-NOV-NEXT: and a0, a2, a0
; CHECK-NOV-NEXT: neg a1, a1
; CHECK-NOV-NEXT: and a1, a1, a3
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
@@ -2941,15 +2939,15 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: # %bb.1: # %entry
; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB23_2: # %entry
-; CHECK-V-NEXT: slti a4, a1, 1
; CHECK-V-NEXT: slti a3, s1, 1
+; CHECK-V-NEXT: slti a4, a1, 1
; CHECK-V-NEXT: blez a1, .LBB23_4
; CHECK-V-NEXT: # %bb.3: # %entry
; CHECK-V-NEXT: li a1, 1
; CHECK-V-NEXT: .LBB23_4: # %entry
-; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: neg a4, a4
; CHECK-V-NEXT: and a0, a4, a0
+; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: beqz a1, .LBB23_7
; CHECK-V-NEXT: # %bb.5: # %entry
; CHECK-V-NEXT: sgtz a1, a1
@@ -3004,8 +3002,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset s2, -32
-; CHECK-NOV-NEXT: mv s2, a1
-; CHECK-NOV-NEXT: fmv.w.x fa0, a0
+; CHECK-NOV-NEXT: mv s2, a0
+; CHECK-NOV-NEXT: fmv.w.x fa0, a1
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixsfti
; CHECK-NOV-NEXT: mv s0, a0
@@ -3013,60 +3011,58 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: fmv.w.x fa0, s2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixsfti
-; CHECK-NOV-NEXT: mv a2, a0
-; CHECK-NOV-NEXT: li a0, -1
-; CHECK-NOV-NEXT: srli a3, a0, 1
-; CHECK-NOV-NEXT: beqz a1, .LBB24_3
+; CHECK-NOV-NEXT: li a2, -1
+; CHECK-NOV-NEXT: srli a3, a2, 1
+; CHECK-NOV-NEXT: beqz s1, .LBB24_3
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: slti a4, a1, 0
-; CHECK-NOV-NEXT: bnez s1, .LBB24_4
+; CHECK-NOV-NEXT: slti a4, s1, 0
+; CHECK-NOV-NEXT: bnez a1, .LBB24_4
; CHECK-NOV-NEXT: .LBB24_2:
-; CHECK-NOV-NEXT: sltu a5, s0, a3
+; CHECK-NOV-NEXT: sltu a5, a0, a3
; CHECK-NOV-NEXT: beqz a5, .LBB24_5
; CHECK-NOV-NEXT: j .LBB24_6
; CHECK-NOV-NEXT: .LBB24_3:
-; CHECK-NOV-NEXT: sltu a4, a2, a3
-; CHECK-NOV-NEXT: beqz s1, .LBB24_2
+; CHECK-NOV-NEXT: sltu a4, s0, a3
+; CHECK-NOV-NEXT: beqz a1, .LBB24_2
; CHECK-NOV-NEXT: .LBB24_4: # %entry
-; CHECK-NOV-NEXT: slti a5, s1, 0
+; CHECK-NOV-NEXT: slti a5, a1, 0
; CHECK-NOV-NEXT: bnez a5, .LBB24_6
; CHECK-NOV-NEXT: .LBB24_5: # %entry
-; CHECK-NOV-NEXT: mv s0, a3
+; CHECK-NOV-NEXT: mv a0, a3
; CHECK-NOV-NEXT: .LBB24_6: # %entry
; CHECK-NOV-NEXT: neg a6, a5
; CHECK-NOV-NEXT: neg a5, a4
-; CHECK-NOV-NEXT: and a5, a5, a1
+; CHECK-NOV-NEXT: and a5, a5, s1
; CHECK-NOV-NEXT: bnez a4, .LBB24_8
; CHECK-NOV-NEXT: # %bb.7: # %entry
-; CHECK-NOV-NEXT: mv a2, a3
+; CHECK-NOV-NEXT: mv s0, a3
; CHECK-NOV-NEXT: .LBB24_8: # %entry
-; CHECK-NOV-NEXT: and a4, a6, s1
-; CHECK-NOV-NEXT: slli a1, a0, 63
-; CHECK-NOV-NEXT: beq a5, a0, .LBB24_11
+; CHECK-NOV-NEXT: and a4, a6, a1
+; CHECK-NOV-NEXT: slli a1, a2, 63
+; CHECK-NOV-NEXT: beq a5, a2, .LBB24_11
; CHECK-NOV-NEXT: # %bb.9: # %entry
; CHECK-NOV-NEXT: slti a3, a5, 0
; CHECK-NOV-NEXT: xori a3, a3, 1
-; CHECK-NOV-NEXT: bne a4, a0, .LBB24_12
+; CHECK-NOV-NEXT: bne a4, a2, .LBB24_12
; CHECK-NOV-NEXT: .LBB24_10:
-; CHECK-NOV-NEXT: sltu a0, a1, s0
-; CHECK-NOV-NEXT: beqz a0, .LBB24_13
+; CHECK-NOV-NEXT: sltu a2, a1, a0
+; CHECK-NOV-NEXT: beqz a2, .LBB24_13
; CHECK-NOV-NEXT: j .LBB24_14
; CHECK-NOV-NEXT: .LBB24_11:
-; CHECK-NOV-NEXT: sltu a3, a1, a2
-; CHECK-NOV-NEXT: beq a4, a0, .LBB24_10
+; CHECK-NOV-NEXT: sltu a3, a1, s0
+; CHECK-NOV-NEXT: beq a4, a2, .LBB24_10
; CHECK-NOV-NEXT: .LBB24_12: # %entry
-; CHECK-NOV-NEXT: slti a0, a4, 0
-; CHECK-NOV-NEXT: xori a0, a0, 1
-; CHECK-NOV-NEXT: bnez a0, .LBB24_14
+; CHECK-NOV-NEXT: slti a2, a4, 0
+; CHECK-NOV-NEXT: xori a2, a2, 1
+; CHECK-NOV-NEXT: bnez a2, .LBB24_14
; CHECK-NOV-NEXT: .LBB24_13: # %entry
-; CHECK-NOV-NEXT: mv s0, a1
+; CHECK-NOV-NEXT: mv a0, a1
; CHECK-NOV-NEXT: .LBB24_14: # %entry
; CHECK-NOV-NEXT: bnez a3, .LBB24_16
; CHECK-NOV-NEXT: # %bb.15: # %entry
-; CHECK-NOV-NEXT: mv a2, a1
+; CHECK-NOV-NEXT: mv s0, a1
; CHECK-NOV-NEXT: .LBB24_16: # %entry
-; CHECK-NOV-NEXT: mv a0, s0
-; CHECK-NOV-NEXT: mv a1, a2
+; CHECK-NOV-NEXT: mv a1, s0
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -3086,8 +3082,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: .cfi_offset s0, -16
; CHECK-V-NEXT: .cfi_offset s1, -24
; CHECK-V-NEXT: .cfi_offset s2, -32
-; CHECK-V-NEXT: mv s2, a1
-; CHECK-V-NEXT: fmv.w.x fa0, a0
+; CHECK-V-NEXT: mv s2, a0
+; CHECK-V-NEXT: fmv.w.x fa0, a1
; CHECK-V-NEXT: call __extendhfsf2
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: mv s0, a0
@@ -3097,31 +3093,31 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: li a2, -1
; CHECK-V-NEXT: srli a3, a2, 1
-; CHECK-V-NEXT: beqz a1, .LBB24_3
+; CHECK-V-NEXT: beqz s1, .LBB24_3
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: slti a4, a1, 0
-; CHECK-V-NEXT: bnez s1, .LBB24_4
+; CHECK-V-NEXT: slti a4, s1, 0
+; CHECK-V-NEXT: bnez a1, .LBB24_4
; CHECK-V-NEXT: .LBB24_2:
-; CHECK-V-NEXT: sltu a5, s0, a3
+; CHECK-V-NEXT: sltu a5, a0, a3
; CHECK-V-NEXT: beqz a5, .LBB24_5
; CHECK-V-NEXT: j .LBB24_6
; CHECK-V-NEXT: .LBB24_3:
-; CHECK-V-NEXT: sltu a4, a0, a3
-; CHECK-V-NEXT: beqz s1, .LBB24_2
+; CHECK-V-NEXT: sltu a4, s0, a3
+; CHECK-V-NEXT: beqz a1, .LBB24_2
; CHECK-V-NEXT: .LBB24_4: # %entry
-; CHECK-V-NEXT: slti a5, s1, 0
+; CHECK-V-NEXT: slti a5, a1, 0
; CHECK-V-NEXT: bnez a5, .LBB24_6
; CHECK-V-NEXT: .LBB24_5: # %entry
-; CHECK-V-NEXT: mv s0, a3
+; CHECK-V-NEXT: mv a0, a3
; CHECK-V-NEXT: .LBB24_6: # %entry
; CHECK-V-NEXT: neg a6, a5
; CHECK-V-NEXT: neg a5, a4
-; CHECK-V-NEXT: and a5, a5, a1
+; CHECK-V-NEXT: and a5, a5, s1
; CHECK-V-NEXT: bnez a4, .LBB24_8
; CHECK-V-NEXT: # %bb.7: # %entry
-; CHECK-V-NEXT: mv a0, a3
+; CHECK-V-NEXT: mv s0, a3
; CHECK-V-NEXT: .LBB24_8: # %entry
-; CHECK-V-NEXT: and a4, a6, s1
+; CHECK-V-NEXT: and a4, a6, a1
; CHECK-V-NEXT: slli a1, a2, 63
; CHECK-V-NEXT: beq a5, a2, .LBB24_11
; CHECK-V-NEXT: # %bb.9: # %entry
@@ -3129,26 +3125,26 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: xori a3, a3, 1
; CHECK-V-NEXT: bne a4, a2, .LBB24_12
; CHECK-V-NEXT: .LBB24_10:
-; CHECK-V-NEXT: sltu a2, a1, s0
+; CHECK-V-NEXT: sltu a2, a1, a0
; CHECK-V-NEXT: beqz a2, .LBB24_13
; CHECK-V-NEXT: j .LBB24_14
; CHECK-V-NEXT: .LBB24_11:
-; CHECK-V-NEXT: sltu a3, a1, a0
+; CHECK-V-NEXT: sltu a3, a1, s0
; CHECK-V-NEXT: beq a4, a2, .LBB24_10
; CHECK-V-NEXT: .LBB24_12: # %entry
; CHECK-V-NEXT: slti a2, a4, 0
; CHECK-V-NEXT: xori a2, a2, 1
; CHECK-V-NEXT: bnez a2, .LBB24_14
; CHECK-V-NEXT: .LBB24_13: # %entry
-; CHECK-V-NEXT: mv s0, a1
+; CHECK-V-NEXT: mv a0, a1
; CHECK-V-NEXT: .LBB24_14: # %entry
; CHECK-V-NEXT: bnez a3, .LBB24_16
; CHECK-V-NEXT: # %bb.15: # %entry
-; CHECK-V-NEXT: mv a0, a1
+; CHECK-V-NEXT: mv s0, a1
; CHECK-V-NEXT: .LBB24_16: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v9, a0
-; CHECK-V-NEXT: vmv.s.x v8, s0
+; CHECK-V-NEXT: vmv.s.x v9, s0
+; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -3179,8 +3175,8 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset s2, -32
-; CHECK-NOV-NEXT: mv s0, a1
-; CHECK-NOV-NEXT: fmv.w.x fa0, a0
+; CHECK-NOV-NEXT: mv s0, a0
+; CHECK-NOV-NEXT: fmv.w.x fa0, a1
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixunssfti
; CHECK-NOV-NEXT: mv s1, a0
@@ -3188,13 +3184,12 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: fmv.w.x fa0, s0
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixunssfti
-; CHECK-NOV-NEXT: snez a1, a1
; CHECK-NOV-NEXT: snez a2, s2
-; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: and a2, a2, s1
+; CHECK-NOV-NEXT: snez a1, a1
; CHECK-NOV-NEXT: addi a1, a1, -1
-; CHECK-NOV-NEXT: and a1, a1, a0
-; CHECK-NOV-NEXT: mv a0, a2
+; CHECK-NOV-NEXT: and a0, a1, a0
+; CHECK-NOV-NEXT: addi a1, a2, -1
+; CHECK-NOV-NEXT: and a1, a1, s1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -3214,8 +3209,8 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: .cfi_offset s0, -16
; CHECK-V-NEXT: .cfi_offset s1, -24
; CHECK-V-NEXT: .cfi_offset s2, -32
-; CHECK-V-NEXT: mv s0, a1
-; CHECK-V-NEXT: fmv.w.x fa0, a0
+; CHECK-V-NEXT: mv s0, a0
+; CHECK-V-NEXT: fmv.w.x fa0, a1
; CHECK-V-NEXT: call __extendhfsf2
; CHECK-V-NEXT: call __fixunssfti
; CHECK-V-NEXT: mv s1, a0
@@ -3223,15 +3218,15 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2
; CHECK-V-NEXT: call __fixunssfti
-; CHECK-V-NEXT: snez a1, a1
; CHECK-V-NEXT: snez a2, s2
-; CHECK-V-NEXT: addi a2, a2, -1
-; CHECK-V-NEXT: and a2, a2, s1
+; CHECK-V-NEXT: snez a1, a1
; CHECK-V-NEXT: addi a1, a1, -1
; CHECK-V-NEXT: and a0, a1, a0
+; CHECK-V-NEXT: addi a2, a2, -1
+; CHECK-V-NEXT: and a2, a2, s1
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v9, a0
-; CHECK-V-NEXT: vmv.s.x v8, a2
+; CHECK-V-NEXT: vmv.s.x v9, a2
+; CHECK-V-NEXT: vmv.s.x v8, a0
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -3274,32 +3269,32 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB26_2: # %entry
-; CHECK-NOV-NEXT: slti a3, a1, 1
; CHECK-NOV-NEXT: slti a4, s1, 1
+; CHECK-NOV-NEXT: slti a3, a1, 1
; CHECK-NOV-NEXT: blez a1, .LBB26_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
; CHECK-NOV-NEXT: li a1, 1
; CHECK-NOV-NEXT: .LBB26_4: # %entry
-; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, a0
+; CHECK-NOV-NEXT: neg a0, a4
; CHECK-NOV-NEXT: beqz a1, .LBB26_7
; CHECK-NOV-NEXT: # %bb.5: # %entry
; CHECK-NOV-NEXT: sgtz a1, a1
-; CHECK-NOV-NEXT: and a4, a4, s0
+; CHECK-NOV-NEXT: and a0, a0, s0
; CHECK-NOV-NEXT: bnez a2, .LBB26_8
; CHECK-NOV-NEXT: .LBB26_6:
-; CHECK-NOV-NEXT: snez a0, a4
+; CHECK-NOV-NEXT: snez a2, a0
; CHECK-NOV-NEXT: j .LBB26_9
; CHECK-NOV-NEXT: .LBB26_7:
; CHECK-NOV-NEXT: snez a1, a3
-; CHECK-NOV-NEXT: and a4, a4, s0
+; CHECK-NOV-NEXT: and a0, a0, s0
; CHECK-NOV-NEXT: beqz a2, .LBB26_6
; CHECK-NOV-NEXT: .LBB26_8: # %entry
-; CHECK-NOV-NEXT: sgtz a0, a2
+; CHECK-NOV-NEXT: sgtz a2, a2
; CHECK-NOV-NEXT: .LBB26_9: # %entry
-; CHECK-NOV-NEXT: neg a0, a0
-; CHECK-NOV-NEXT: and a0, a0, a4
+; CHECK-NOV-NEXT: neg a2, a2
+; CHECK-NOV-NEXT: and a0, a2, a0
; CHECK-NOV-NEXT: neg a1, a1
; CHECK-NOV-NEXT: and a1, a1, a3
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
@@ -3335,15 +3330,15 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: # %bb.1: # %entry
; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB26_2: # %entry
-; CHECK-V-NEXT: slti a4, a1, 1
; CHECK-V-NEXT: slti a3, s1, 1
+; CHECK-V-NEXT: slti a4, a1, 1
; CHECK-V-NEXT: blez a1, .LBB26_4
; CHECK-V-NEXT: # %bb.3: # %entry
; CHECK-V-NEXT: li a1, 1
; CHECK-V-NEXT: .LBB26_4: # %entry
-; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: neg a4, a4
; CHECK-V-NEXT: and a0, a4, a0
+; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: beqz a1, .LBB26_7
; CHECK-V-NEXT: # %bb.5: # %entry
; CHECK-V-NEXT: sgtz a1, a1
@@ -5816,15 +5811,15 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.d fa0, fs0
; CHECK-NOV-NEXT: call __fixdfti
-; CHECK-NOV-NEXT: mv a2, a1
+; CHECK-NOV-NEXT: mv a2, s1
+; CHECK-NOV-NEXT: mv a3, a1
; CHECK-NOV-NEXT: blez a1, .LBB47_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: li a2, 1
+; CHECK-NOV-NEXT: li a3, 1
; CHECK-NOV-NEXT: .LBB47_2: # %entry
-; CHECK-NOV-NEXT: mv a3, s1
-; CHECK-NOV-NEXT: blez s1, .LBB47_4
+; CHECK-NOV-NEXT: blez a2, .LBB47_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: li a3, 1
+; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB47_4: # %entry
; CHECK-NOV-NEXT: slti a1, a1, 1
; CHECK-NOV-NEXT: neg a1, a1
@@ -5832,11 +5827,11 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-NOV-NEXT: slti a0, s1, 1
; CHECK-NOV-NEXT: neg a0, a0
; CHECK-NOV-NEXT: and a0, a0, s0
-; CHECK-NOV-NEXT: slti a3, a3, 0
-; CHECK-NOV-NEXT: addi a3, a3, -1
-; CHECK-NOV-NEXT: and a0, a3, a0
; CHECK-NOV-NEXT: slti a2, a2, 0
; CHECK-NOV-NEXT: addi a2, a2, -1
+; CHECK-NOV-NEXT: and a0, a2, a0
+; CHECK-NOV-NEXT: slti a2, a3, 0
+; CHECK-NOV-NEXT: addi a2, a2, -1
; CHECK-NOV-NEXT: and a1, a2, a1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -5872,15 +5867,15 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixdfti
-; CHECK-V-NEXT: mv a2, a1
+; CHECK-V-NEXT: mv a2, s1
+; CHECK-V-NEXT: mv a3, a1
; CHECK-V-NEXT: blez a1, .LBB47_2
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: li a2, 1
+; CHECK-V-NEXT: li a3, 1
; CHECK-V-NEXT: .LBB47_2: # %entry
-; CHECK-V-NEXT: mv a3, s1
-; CHECK-V-NEXT: blez s1, .LBB47_4
+; CHECK-V-NEXT: blez a2, .LBB47_4
; CHECK-V-NEXT: # %bb.3: # %entry
-; CHECK-V-NEXT: li a3, 1
+; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB47_4: # %entry
; CHECK-V-NEXT: slti a1, a1, 1
; CHECK-V-NEXT: neg a1, a1
@@ -5888,11 +5883,11 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-V-NEXT: slti a1, s1, 1
; CHECK-V-NEXT: neg a1, a1
; CHECK-V-NEXT: and a1, a1, s0
-; CHECK-V-NEXT: slti a3, a3, 0
-; CHECK-V-NEXT: addi a3, a3, -1
-; CHECK-V-NEXT: and a1, a3, a1
; CHECK-V-NEXT: slti a2, a2, 0
; CHECK-V-NEXT: addi a2, a2, -1
+; CHECK-V-NEXT: and a1, a2, a1
+; CHECK-V-NEXT: slti a2, a3, 0
+; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a0, a2, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
@@ -6202,15 +6197,15 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.s fa0, fs0
; CHECK-NOV-NEXT: call __fixsfti
-; CHECK-NOV-NEXT: mv a2, a1
+; CHECK-NOV-NEXT: mv a2, s1
+; CHECK-NOV-NEXT: mv a3, a1
; CHECK-NOV-NEXT: blez a1, .LBB50_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: li a2, 1
+; CHECK-NOV-NEXT: li a3, 1
; CHECK-NOV-NEXT: .LBB50_2: # %entry
-; CHECK-NOV-NEXT: mv a3, s1
-; CHECK-NOV-NEXT: blez s1, .LBB50_4
+; CHECK-NOV-NEXT: blez a2, .LBB50_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: li a3, 1
+; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB50_4: # %entry
; CHECK-NOV-NEXT: slti a1, a1, 1
; CHECK-NOV-NEXT: neg a1, a1
@@ -6218,11 +6213,11 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-NOV-NEXT: slti a0, s1, 1
; CHECK-NOV-NEXT: neg a0, a0
; CHECK-NOV-NEXT: and a0, a0, s0
-; CHECK-NOV-NEXT: slti a3, a3, 0
-; CHECK-NOV-NEXT: addi a3, a3, -1
-; CHECK-NOV-NEXT: and a0, a3, a0
; CHECK-NOV-NEXT: slti a2, a2, 0
; CHECK-NOV-NEXT: addi a2, a2, -1
+; CHECK-NOV-NEXT: and a0, a2, a0
+; CHECK-NOV-NEXT: slti a2, a3, 0
+; CHECK-NOV-NEXT: addi a2, a2, -1
; CHECK-NOV-NEXT: and a1, a2, a1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -6258,15 +6253,15 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixsfti
-; CHECK-V-NEXT: mv a2, a1
+; CHECK-V-NEXT: mv a2, s1
+; CHECK-V-NEXT: mv a3, a1
; CHECK-V-NEXT: blez a1, .LBB50_2
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: li a2, 1
+; CHECK-V-NEXT: li a3, 1
; CHECK-V-NEXT: .LBB50_2: # %entry
-; CHECK-V-NEXT: mv a3, s1
-; CHECK-V-NEXT: blez s1, .LBB50_4
+; CHECK-V-NEXT: blez a2, .LBB50_4
; CHECK-V-NEXT: # %bb.3: # %entry
-; CHECK-V-NEXT: li a3, 1
+; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB50_4: # %entry
; CHECK-V-NEXT: slti a1, a1, 1
; CHECK-V-NEXT: neg a1, a1
@@ -6274,11 +6269,11 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-V-NEXT: slti a1, s1, 1
; CHECK-V-NEXT: neg a1, a1
; CHECK-V-NEXT: and a1, a1, s0
-; CHECK-V-NEXT: slti a3, a3, 0
-; CHECK-V-NEXT: addi a3, a3, -1
-; CHECK-V-NEXT: and a1, a3, a1
; CHECK-V-NEXT: slti a2, a2, 0
; CHECK-V-NEXT: addi a2, a2, -1
+; CHECK-V-NEXT: and a1, a2, a1
+; CHECK-V-NEXT: slti a2, a3, 0
+; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a0, a2, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
@@ -6580,15 +6575,15 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-NOV-NEXT: fmv.w.x fa0, s2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixsfti
-; CHECK-NOV-NEXT: mv a2, a1
+; CHECK-NOV-NEXT: mv a2, s1
+; CHECK-NOV-NEXT: mv a3, a1
; CHECK-NOV-NEXT: blez a1, .LBB53_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: li a2, 1
+; CHECK-NOV-NEXT: li a3, 1
; CHECK-NOV-NEXT: .LBB53_2: # %entry
-; CHECK-NOV-NEXT: mv a3, s1
-; CHECK-NOV-NEXT: blez s1, .LBB53_4
+; CHECK-NOV-NEXT: blez a2, .LBB53_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: li a3, 1
+; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB53_4: # %entry
; CHECK-NOV-NEXT: slti a1, a1, 1
; CHECK-NOV-NEXT: neg a1, a1
@@ -6596,11 +6591,11 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-NOV-NEXT: slti a0, s1, 1
; CHECK-NOV-NEXT: neg a0, a0
; CHECK-NOV-NEXT: and a0, a0, s0
-; CHECK-NOV-NEXT: slti a3, a3, 0
-; CHECK-NOV-NEXT: addi a3, a3, -1
-; CHECK-NOV-NEXT: and a0, a3, a0
; CHECK-NOV-NEXT: slti a2, a2, 0
; CHECK-NOV-NEXT: addi a2, a2, -1
+; CHECK-NOV-NEXT: and a0, a2, a0
+; CHECK-NOV-NEXT: slti a2, a3, 0
+; CHECK-NOV-NEXT: addi a2, a2, -1
; CHECK-NOV-NEXT: and a1, a2, a1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -6630,15 +6625,15 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2
; CHECK-V-NEXT: call __fixsfti
-; CHECK-V-NEXT: mv a2, a1
+; CHECK-V-NEXT: mv a2, s1
+; CHECK-V-NEXT: mv a3, a1
; CHECK-V-NEXT: blez a1, .LBB53_2
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: li a2, 1
+; CHECK-V-NEXT: li a3, 1
; CHECK-V-NEXT: .LBB53_2: # %entry
-; CHECK-V-NEXT: mv a3, s1
-; CHECK-V-NEXT: blez s1, .LBB53_4
+; CHECK-V-NEXT: blez a2, .LBB53_4
; CHECK-V-NEXT: # %bb.3: # %entry
-; CHECK-V-NEXT: li a3, 1
+; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB53_4: # %entry
; CHECK-V-NEXT: slti a1, a1, 1
; CHECK-V-NEXT: neg a1, a1
@@ -6646,11 +6641,11 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-V-NEXT: slti a1, s1, 1
; CHECK-V-NEXT: neg a1, a1
; CHECK-V-NEXT: and a1, a1, s0
-; CHECK-V-NEXT: slti a3, a3, 0
-; CHECK-V-NEXT: addi a3, a3, -1
-; CHECK-V-NEXT: and a1, a3, a1
; CHECK-V-NEXT: slti a2, a2, 0
; CHECK-V-NEXT: addi a2, a2, -1
+; CHECK-V-NEXT: and a1, a2, a1
+; CHECK-V-NEXT: slti a2, a3, 0
+; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a0, a2, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v9, a0
diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index 0c33e8973c2d204..dd180b67e492a0f 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -715,41 +715,43 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
;
; SSE42-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: psubq %xmm1, %xmm2
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm3
-; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT: pxor %xmm4, %xmm1
-; SSE42-NEXT: pxor %xmm4, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE42-NEXT: paddq %xmm3, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm3
+; SSE42-NEXT: pxor %xmm0, %xmm2
+; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: psubq %xmm1, %xmm3
+; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE42-NEXT: paddq %xmm1, %xmm2
+; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm3
-; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: # xmm4 = mem[0,0]
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm1
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm2 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm1
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
+; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_cmp_v2i64_multiuse_cmp:
diff --git a/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll b/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
index b2cb2c3e04b3f4b..017024c173c3f7e 100644
--- a/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
+++ b/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
@@ -52,7 +52,10 @@ alloca_21:
define i32 @kmovrk_1(<4 x ptr> %arg) {
; AVX512-LABEL: kmovrk_1:
; AVX512: # %bb.0: # %bb
-; AVX512-NEXT: vptest %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x17,0xc0]
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
+; AVX512-NEXT: kmovw %k0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x93,0xc0]
+; AVX512-NEXT: testb $15, %al # encoding: [0xa8,0x0f]
; AVX512-NEXT: jne .LBB2_1 # encoding: [0x75,A]
; AVX512-NEXT: # fixup A - offset: 1, value: .LBB2_1-1, kind: FK_PCRel_1
; AVX512-NEXT: # %bb.2: # %bb3
@@ -63,7 +66,10 @@ define i32 @kmovrk_1(<4 x ptr> %arg) {
;
; AVX512BW-LABEL: kmovrk_1:
; AVX512BW: # %bb.0: # %bb
-; AVX512BW-NEXT: vptest %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x17,0xc0]
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
+; AVX512BW-NEXT: kmovd %k0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x93,0xc0]
+; AVX512BW-NEXT: testb $15, %al # encoding: [0xa8,0x0f]
; AVX512BW-NEXT: jne .LBB2_1 # encoding: [0x75,A]
; AVX512BW-NEXT: # fixup A - offset: 1, value: .LBB2_1-1, kind: FK_PCRel_1
; AVX512BW-NEXT: # %bb.2: # %bb3
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
index 7e48b3719cf0ffa..13d1265a249d1fa 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
@@ -30,13 +30,13 @@ define <64 x i8> @add_v64i8_broadcasts(<64 x i8> %a0, i64 %a1, i8 %a2) {
; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm4
-; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm4
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 5d7bf4a2c9788ff..8d2bb77a9e1af68 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -80,13 +80,13 @@ define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_pow2c:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: paddq %xmm0, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllq $4, %xmm2
; SSE-NEXT: psllq $2, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: paddq %xmm0, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_pow2c:
@@ -399,12 +399,14 @@ define i64 @combine_mul_self_demandedbits(i64 %x) {
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: imulq %rdi, %rax
+; SSE-NEXT: andq $-3, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_mul_self_demandedbits:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: imulq %rdi, %rax
+; AVX-NEXT: andq $-3, %rax
; AVX-NEXT: retq
%1 = mul i64 %x, %x
%2 = and i64 %1, -3
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 33cc8e96f663f5a..e12ca56023a7f2c 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -182,101 +182,101 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sarl $31, %edi
; X86-NEXT: movl %eax, %esi
; X86-NEXT: xorl %ecx, %esi
; X86-NEXT: movl %esi, %ebp
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl %ecx, %ebx
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi
; X86-NEXT: subl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ebp
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl %edi, %esi
; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %edi, %ebp
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: subl %ebx, %ebp
-; X86-NEXT: sbbl %ebx, %edi
-; X86-NEXT: sbbl %ebx, %edx
-; X86-NEXT: sbbl %ebx, %esi
-; X86-NEXT: xorl %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: subl %edi, %ebp
+; X86-NEXT: sbbl %edi, %ebx
+; X86-NEXT: sbbl %edi, %edx
+; X86-NEXT: sbbl %edi, %esi
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %ebx, %ecx
+; X86-NEXT: bsrl %edi, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bsrl %ebp, %ebp
; X86-NEXT: xorl $31, %ebp
; X86-NEXT: addl $32, %ebp
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ebp
; X86-NEXT: addl $64, %ebp
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: orl %esi, %edi
; X86-NEXT: cmovnel %ecx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: bsrl %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: bsrl %ebx, %esi
; X86-NEXT: xorl $31, %esi
-; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: bsrl (%esp), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
; X86-NEXT: addl $32, %edx
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: addl $64, %edx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
; X86-NEXT: cmovnel %ecx, %edx
; X86-NEXT: xorl %esi, %esi
; X86-NEXT: subl %edx, %ebp
-; X86-NEXT: movl $0, %edi
-; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
; X86-NEXT: movl $0, %eax
@@ -284,39 +284,39 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl $127, %ecx
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpl %ebp, %ecx
-; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: cmovnel %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT: cmovnel %esi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: cmovnel %esi, %eax
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: jne .LBB4_8
; X86-NEXT: # %bb.1: # %_udiv-special-cases
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: xorl $127, %edi
-; X86-NEXT: orl %ebp, %edi
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: xorl $127, %ebx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: je .LBB4_8
; X86-NEXT: # %bb.2: # %udiv-bb1
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -332,44 +332,41 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $15, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 144(%esp,%edi), %edx
-; X86-NEXT: movl 148(%esp,%edi), %esi
+; X86-NEXT: movsbl %al, %ebx
+; X86-NEXT: movl 144(%esp,%ebx), %edx
+; X86-NEXT: movl 148(%esp,%ebx), %edi
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %edi
; X86-NEXT: shll %cl, %edx
; X86-NEXT: notb %cl
-; X86-NEXT: movl 140(%esp,%edi), %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: shrl %ebx
-; X86-NEXT: shrl %cl, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: movl 136(%esp,%edi), %edx
+; X86-NEXT: movl 140(%esp,%ebx), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shrl %esi
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl 136(%esp,%ebx), %esi
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: jae .LBB4_3
; X86-NEXT: # %bb.6:
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: xorl %esi, %esi
; X86-NEXT: jmp .LBB4_7
; X86-NEXT: .LBB4_3: # %udiv-preheader
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
@@ -377,188 +374,192 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %dl, %ch
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movb %bl, %ch
; X86-NEXT: andb $7, %ch
-; X86-NEXT: movb %dl, %cl
+; X86-NEXT: movb %bl, %cl
; X86-NEXT: shrb $3, %cl
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %edx
-; X86-NEXT: movl 100(%esp,%edx), %esi
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT: movl 96(%esp,%edx), %edi
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: movzbl %cl, %ebp
+; X86-NEXT: movl 100(%esp,%ebp), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 96(%esp,%ebp), %ebx
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %esi, %ebp
-; X86-NEXT: movl 88(%esp,%edx), %ebx
-; X86-NEXT: movl 92(%esp,%edx), %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: shrdl %cl, %esi, %edx
+; X86-NEXT: movl 88(%esp,%ebp), %ebp
+; X86-NEXT: movl 92(%esp,%eax), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shrl %cl, %eax
; X86-NEXT: notb %cl
-; X86-NEXT: addl %edi, %edi
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: addl %ebx, %ebx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %eax, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movb %ch, %cl
+; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shrdl %cl, %esi, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %esi, %esi
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB4_4: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $1, %ebp, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: orl %ebx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %eax, %ecx
-; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl $1, %ebp, %edx
+; X86-NEXT: shldl $1, %edi, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: shldl $1, %ecx, %eax
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %esi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebp, %ecx
+; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl %edi, %esi
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: subl %ecx, %ebp
+; X86-NEXT: sbbl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: sbbl %eax, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %edi, %esi
-; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %ebx, (%esp) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edi, %edx
+; X86-NEXT: movl (%esp), %edi # 4-byte Reload
+; X86-NEXT: sbbl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl $-1, %esi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %edi
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: jne .LBB4_4
; X86-NEXT: # %bb.5:
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %ebx, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: shldl $1, %eax, %ebx
-; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: shldl $1, %esi, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: addl %esi, %esi
-; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: .LBB4_8: # %udiv-end
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, %edi
; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: xorl %ecx, %ebx
; X86-NEXT: xorl %ecx, %eax
; X86-NEXT: xorl %ecx, %esi
; X86-NEXT: subl %ecx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ecx, %ebx
; X86-NEXT: sbbl %ecx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %esi, (%ecx)
-; X86-NEXT: movl %eax, 4(%ecx)
-; X86-NEXT: movl %ebx, 8(%ecx)
-; X86-NEXT: movl %edx, 12(%ecx)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl %ecx, %edi
+; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT: movl %esi, (%ebp)
+; X86-NEXT: movl %eax, 4(%ebp)
+; X86-NEXT: movl %edx, 8(%ebp)
+; X86-NEXT: movl %edi, 12(%ebp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %edx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: addl %ecx, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: imull %esi, %ecx
@@ -567,12 +568,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: mull %edx
; X86-NEXT: addl %edx, %ebp
; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %ebx, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll
index b2614c5fe0493c0..135494ac25f8cb2 100644
--- a/llvm/test/CodeGen/X86/fold-masked-merge.ll
+++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll
@@ -56,7 +56,9 @@ define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
; NOBMI-LABEL: masked_merge2:
; NOBMI: # %bb.0:
; NOBMI-NEXT: movl %esi, %eax
-; NOBMI-NEXT: # kill: def $al killed $al killed $eax
+; NOBMI-NEXT: xorb %sil, %al
+; NOBMI-NEXT: andb %dil, %al
+; NOBMI-NEXT: xorb %sil, %al
; NOBMI-NEXT: retq
;
; BMI-LABEL: masked_merge2:
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index c79da37988e40b0..b212e9438e1b52f 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -459,7 +459,8 @@ define i32 @freeze_ashr(i32 %a0) nounwind {
; X64-LABEL: freeze_ashr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: sarl $6, %eax
+; X64-NEXT: sarl $3, %eax
+; X64-NEXT: sarl $3, %eax
; X64-NEXT: retq
%x = ashr i32 %a0, 3
%y = freeze i32 %x
@@ -530,12 +531,30 @@ define i32 @freeze_ashr_outofrange(i32 %a0) nounwind {
define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
; X86-LABEL: freeze_ashr_vec:
; X86: # %bb.0:
-; X86-NEXT: psraw $4, %xmm0
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: psraw $1, %xmm2
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; X86-NEXT: movdqa %xmm1, %xmm3
+; X86-NEXT: pandn %xmm2, %xmm3
+; X86-NEXT: psraw $3, %xmm0
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: por %xmm3, %xmm0
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: psraw $3, %xmm2
+; X86-NEXT: psraw $1, %xmm0
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: pandn %xmm2, %xmm1
+; X86-NEXT: por %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_ashr_vec:
; X64: # %bb.0:
-; X64-NEXT: vpsraw $4, %xmm0, %xmm0
+; X64-NEXT: vpsraw $1, %xmm0, %xmm1
+; X64-NEXT: vpsraw $3, %xmm0, %xmm0
+; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT: vpsraw $3, %xmm0, %xmm1
+; X64-NEXT: vpsraw $1, %xmm0, %xmm0
+; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; X64-NEXT: retq
%x = ashr <8 x i16> %a0, <i16 3, i16 1, i16 3, i16 1, i16 3, i16 1, i16 3, i16 1>
%y = freeze <8 x i16> %x
@@ -573,7 +592,8 @@ define i32 @freeze_lshr(i32 %a0) nounwind {
; X64-LABEL: freeze_lshr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $3, %eax
+; X64-NEXT: shrl $2, %eax
+; X64-NEXT: shrl %eax
; X64-NEXT: retq
%x = lshr i32 %a0, 2
%y = freeze i32 %x
@@ -644,12 +664,30 @@ define i32 @freeze_lshr_outofrange(i32 %a0) nounwind {
define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
; X86-LABEL: freeze_lshr_vec:
; X86: # %bb.0:
-; X86-NEXT: psrlw $3, %xmm0
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: psrlw $1, %xmm2
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; X86-NEXT: movdqa %xmm1, %xmm3
+; X86-NEXT: pandn %xmm2, %xmm3
+; X86-NEXT: psrlw $2, %xmm0
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: por %xmm3, %xmm0
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: psrlw $2, %xmm2
+; X86-NEXT: psrlw $1, %xmm0
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: pandn %xmm2, %xmm1
+; X86-NEXT: por %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_lshr_vec:
; X64: # %bb.0:
-; X64-NEXT: vpsrlw $3, %xmm0, %xmm0
+; X64-NEXT: vpsrlw $1, %xmm0, %xmm1
+; X64-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT: vpsrlw $2, %xmm0, %xmm1
+; X64-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; X64-NEXT: retq
%x = lshr <8 x i16> %a0, <i16 2, i16 1, i16 2, i16 1, i16 2, i16 1, i16 2, i16 1>
%y = freeze <8 x i16> %x
diff --git a/llvm/test/CodeGen/X86/freeze-combine.ll b/llvm/test/CodeGen/X86/freeze-combine.ll
index 1cfb8627a4dd457..b037a6d9a1b93b9 100644
--- a/llvm/test/CodeGen/X86/freeze-combine.ll
+++ b/llvm/test/CodeGen/X86/freeze-combine.ll
@@ -3,9 +3,9 @@
define i32 @const() {
; CHECK-LABEL: name: const
; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 1
- ; CHECK-NEXT: $eax = COPY [[MOV32ri]]
- ; CHECK-NEXT: RET 0, $eax
+ ; CHECK: [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 1
+ ; CHECK: $eax = COPY [[MOV32ri]]
+ ; CHECK: RET 0, $eax
%y = freeze i32 1
ret i32 %y
}
@@ -13,11 +13,11 @@ define i32 @const() {
define i32 @fold(i32 %x) {
; CHECK-LABEL: name: fold
; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $edi
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: RET 0, $eax
+ ; CHECK: liveins: $edi
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY $edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY [[COPY]]
+ ; CHECK: $eax = COPY [[COPY1]]
+ ; CHECK: RET 0, $eax
%y = freeze i32 %x
%z = freeze i32 %y
ret i32 %z
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index afe0ebb9dcb4f08..0c341dc63a9ecc9 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -522,17 +522,17 @@ declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
define <16 x i8> @splatconstant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; GFNISSE-LABEL: splatconstant_fshr_v16i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
; GFNISSE-NEXT: psrlw $7, %xmm1
; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
; GFNISSE-NEXT: por %xmm1, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1OR2-LABEL: splatconstant_fshr_v16i8:
; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm1, %xmm1
; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index 96aff5b2af31556..7ab8300b269a48e 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -421,18 +421,18 @@ define <16 x i8> @splatconstant_rotr_v16i8(<16 x i8> %a) nounwind {
; GFNISSE-LABEL: splatconstant_rotr_v16i8:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa %xmm0, %xmm1
-; GFNISSE-NEXT: paddb %xmm0, %xmm1
-; GFNISSE-NEXT: psrlw $7, %xmm0
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT: psrlw $7, %xmm1
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
; GFNISSE-NEXT: por %xmm1, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1OR2-LABEL: splatconstant_rotr_v16i8:
; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm1
-; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm0, %xmm1
+; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: retq
;
; GFNIAVX512-LABEL: splatconstant_rotr_v16i8:
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index 2f780e3c6fe1f1e..39d02f9112f4fcb 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -676,13 +676,12 @@ define i32 @rotr_known_nonzero(i32 %xx, i32 %y) {
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: orl $256, %edi # imm = 0x100
-; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: rorl %cl, %eax
+; X64-NEXT: rorl %cl, %edi
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB22_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB22_1:
; X64-NEXT: movl $32, %eax
@@ -714,13 +713,12 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotr_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: rorl %cl, %eax
+; X64-NEXT: rorl %cl, %edi
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB23_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB23_1:
; X64-NEXT: movl $32, %eax
@@ -775,13 +773,12 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotr_with_fshr_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: rorl %cl, %eax
+; X64-NEXT: rorl %cl, %edi
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB25_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB25_1:
; X64-NEXT: movl $32, %eax
@@ -811,13 +808,12 @@ define i32 @rotl_known_nonzero(i32 %xx, i32 %y) {
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: orl $256, %edi # imm = 0x100
-; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: roll %cl, %eax
+; X64-NEXT: roll %cl, %edi
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB26_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB26_1:
; X64-NEXT: movl $32, %eax
@@ -849,13 +845,12 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotl_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: roll %cl, %eax
+; X64-NEXT: roll %cl, %edi
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB27_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB27_1:
; X64-NEXT: movl $32, %eax
@@ -910,13 +905,12 @@ define i32 @rotl_with_fshl_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotl_with_fshl_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: roll %cl, %eax
+; X64-NEXT: roll %cl, %edi
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB29_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: rep bsfl %edi, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB29_1:
; X64-NEXT: movl $32, %eax
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index 5a6375e08bcaff6..d3cced3233ea650 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -893,26 +893,27 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1]
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psubq %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm3, %xmm6
+; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psubq %xmm1, %xmm3
; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm1
-; SSE41-NEXT: pmuludq %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrlq $32, %xmm4
-; SSE41-NEXT: pmuludq %xmm0, %xmm4
-; SSE41-NEXT: paddq %xmm1, %xmm4
-; SSE41-NEXT: psllq $32, %xmm4
-; SSE41-NEXT: pmuludq %xmm3, %xmm0
+; SSE41-NEXT: pmuludq %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: psrlq $32, %xmm3
+; SSE41-NEXT: pmuludq %xmm0, %xmm3
+; SSE41-NEXT: paddq %xmm1, %xmm3
+; SSE41-NEXT: psllq $32, %xmm3
+; SSE41-NEXT: pmuludq %xmm6, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
-; SSE41-NEXT: paddq %xmm4, %xmm0
+; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: vec128_i64_signed_reg_reg:
@@ -1076,26 +1077,27 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1]
-; SSE41-NEXT: por %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psubq %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm3, %xmm6
+; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psubq %xmm1, %xmm3
; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm1
-; SSE41-NEXT: pmuludq %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrlq $32, %xmm4
-; SSE41-NEXT: pmuludq %xmm0, %xmm4
-; SSE41-NEXT: paddq %xmm1, %xmm4
-; SSE41-NEXT: psllq $32, %xmm4
-; SSE41-NEXT: pmuludq %xmm3, %xmm0
+; SSE41-NEXT: pmuludq %xmm6, %xmm1
+; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: psrlq $32, %xmm3
+; SSE41-NEXT: pmuludq %xmm0, %xmm3
+; SSE41-NEXT: paddq %xmm1, %xmm3
+; SSE41-NEXT: psllq $32, %xmm3
+; SSE41-NEXT: pmuludq %xmm6, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
-; SSE41-NEXT: paddq %xmm4, %xmm0
+; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1991,14 +1993,14 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
@@ -2784,14 +2786,14 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
;
; AVX512VL-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index e880a1acc9e83fe..cc08396ae8c78fe 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -1445,14 +1445,14 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
;
; AVX512VL-FALLBACK-LABEL: vec256_i16_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
@@ -2210,14 +2210,14 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
;
; AVX512VL-FALLBACK-LABEL: vec256_i8_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index 366dad1612b437a..2fdf6ef224ca960 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -684,21 +684,22 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
-; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -714,21 +715,22 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -770,19 +772,20 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6
; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
-; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
-; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpandq %zmm6, %zmm4, %zmm4
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -800,19 +803,20 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm4, %zmm4
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index ace78b38d53edbb..04aff9b7d2e5863 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -22,7 +22,7 @@ define void @f() nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $160, %esp
+; X86-NEXT: subl $176, %esp
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -47,54 +47,55 @@ define void @f() nounwind {
; X86-NEXT: testl %edi, %edi
; X86-NEXT: jne .LBB0_1
; X86-NEXT: # %bb.2: # %BB_udiv-special-cases
-; X86-NEXT: bsrl %esi, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: addl $32, %eax
+; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: addl $32, %ecx
; X86-NEXT: jmp .LBB0_3
; X86-NEXT: .LBB0_1:
-; X86-NEXT: bsrl %edi, %eax
-; X86-NEXT: xorl $31, %eax
+; X86-NEXT: bsrl %edi, %ecx
+; X86-NEXT: xorl $31, %ecx
; X86-NEXT: .LBB0_3: # %BB_udiv-special-cases
-; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: xorl %eax, %eax
; X86-NEXT: testl %edx, %edx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: jne .LBB0_4
; X86-NEXT: # %bb.5: # %BB_udiv-special-cases
-; X86-NEXT: addl $64, %eax
+; X86-NEXT: addl $64, %ecx
; X86-NEXT: jmp .LBB0_6
; X86-NEXT: .LBB0_4:
-; X86-NEXT: bsrl %edx, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: addl $32, %eax
+; X86-NEXT: bsrl %edx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: addl $32, %ecx
; X86-NEXT: .LBB0_6: # %BB_udiv-special-cases
-; X86-NEXT: subl $62, %eax
+; X86-NEXT: subl $62, %ecx
; X86-NEXT: movl $0, %ebx
; X86-NEXT: sbbl %ebx, %ebx
-; X86-NEXT: sbbl %ecx, %ecx
-; X86-NEXT: addl $-66, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: addl $-66, %ecx
; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: adcl $3, %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movb $1, %cl
-; X86-NEXT: testb %cl, %cl
+; X86-NEXT: adcl $3, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movb $1, %al
+; X86-NEXT: testb %al, %al
; X86-NEXT: jne .LBB0_11
; X86-NEXT: # %bb.7: # %BB_udiv-special-cases
-; X86-NEXT: andl $3, %esi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: xorl $65, %ecx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: andl $3, %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: xorl $65, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: je .LBB0_11
; X86-NEXT: # %bb.8: # %udiv-bb1
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl $1, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: andl $3, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: andl $3, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movb $65, %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, %ch
@@ -111,31 +112,29 @@ define void @f() nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 120(%esp,%eax), %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 136(%esp,%eax), %edx
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shll %cl, %edi
+; X86-NEXT: shll %cl, %edx
; X86-NEXT: notb %cl
-; X86-NEXT: movl 112(%esp,%eax), %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 116(%esp,%eax), %edx
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl 128(%esp,%eax), %edi
+; X86-NEXT: movl 132(%esp,%eax), %esi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: shrl %eax
; X86-NEXT: shrl %cl, %eax
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %esi
+; X86-NEXT: shldl %cl, %edi, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: je .LBB0_11
; X86-NEXT: # %bb.9: # %udiv-preheader
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: andl $3, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: andl $3, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -150,20 +149,20 @@ define void @f() nounwind {
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $15, %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 64(%esp,%eax), %edi
-; X86-NEXT: movl 68(%esp,%eax), %edx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movzbl %al, %esi
+; X86-NEXT: movl 80(%esp,%esi), %edx
+; X86-NEXT: movl 84(%esp,%esi), %eax
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: shrl %cl, %edi
; X86-NEXT: notb %cl
-; X86-NEXT: movl 72(%esp,%eax), %ebx
-; X86-NEXT: addl %ebx, %ebx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl 88(%esp,%esi), %esi
+; X86-NEXT: addl %esi, %esi
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: orl %edi, %esi
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %edx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
@@ -180,62 +179,63 @@ define void @f() nounwind {
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB0_10: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: shldl $1, %ebx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: andl $2, %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: leal (%edx,%ebx,2), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %esi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: shldl $1, %edx, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl $2, %eax
+; X86-NEXT: shrl %eax
+; X86-NEXT: leal (%eax,%edx,2), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %eax, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %eax, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $3, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: sbbl %edi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: sbbl %ecx, %esi
-; X86-NEXT: shll $30, %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: sarl $30, %edx
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: shrdl $1, %esi, %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: andl $3, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %ebx
+; X86-NEXT: shll $30, %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: sarl $30, %eax
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: shrdl $1, %ebx, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: andl $3, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl $3, %esi
-; X86-NEXT: andl $3, %esi
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $3, %edi
+; X86-NEXT: andl $3, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: jne .LBB0_10
; X86-NEXT: .LBB0_11: # %udiv-end
; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index 1b13cee628df676..782c84408f25abc 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -8,20 +8,21 @@ define i64 @PR62286(i32 %a) {
; SSE-LABEL: PR62286:
; SSE: # %bb.0:
; SSE-NEXT: movd %edi, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,1,0]
-; SSE-NEXT: paddd %xmm0, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,0]
+; SSE-NEXT: paddd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: pcmpgtd %xmm1, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: paddq %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: paddq %xmm1, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT: paddq %xmm0, %xmm1
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: PR62286:
@@ -46,10 +47,10 @@ define i64 @PR62286(i32 %a) {
; AVX2-LABEL: PR62286:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1
-; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 785b97d8c24027c..a9f3e8b22fb69ed 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -14,6 +14,7 @@ define i256 @test1(i256 %a) nounwind {
; ILP: # %bb.0:
; ILP-NEXT: movq %rdi, %rax
; ILP-NEXT: leal (%rsi,%rsi), %ecx
+; ILP-NEXT: addb $3, %cl
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
@@ -22,7 +23,6 @@ define i256 @test1(i256 %a) nounwind {
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: addb $3, %cl
; ILP-NEXT: movl %ecx, %edx
; ILP-NEXT: shrb $3, %dl
; ILP-NEXT: andb $7, %cl
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index a1cabb433d879bc..31297a06f809935 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -563,20 +563,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: subq $120, %rsp
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pxor %xmm3, %xmm3
-; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; X64-NEXT: psllq $32, %xmm3
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; X64-NEXT: psrad $31, %xmm2
; X64-NEXT: psrlq $31, %xmm3
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm0, %rbp
-; X64-NEXT: movq %rbp, %r14
-; X64-NEXT: sarq $63, %r14
-; X64-NEXT: shldq $31, %rbp, %r14
-; X64-NEXT: movq %rbp, %r15
-; X64-NEXT: shlq $31, %r15
+; X64-NEXT: movq %xmm0, %rbx
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: sarq $63, %r13
+; X64-NEXT: shldq $31, %rbx, %r13
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtd %xmm1, %xmm0
@@ -584,113 +582,113 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movq %xmm1, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: sarq $63, %rbx
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, %rbp
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %r13
-; X64-NEXT: sbbq $0, %r12
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: subq $1, %rbp
+; X64-NEXT: sbbq $0, %r14
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %r15d, %ebx
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: shrq $63, %rbp
-; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rdx, %r13
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: cmpq %rdx, %rbp
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: sbbq $0, %rax
-; X64-NEXT: cmovgeq %rdx, %r13
-; X64-NEXT: cmovgeq %rcx, %r12
+; X64-NEXT: cmovgeq %rcx, %r14
+; X64-NEXT: cmovgeq %rdx, %rbp
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %r13, %rcx
+; X64-NEXT: cmpq %rbp, %rcx
; X64-NEXT: movq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r12, %rax
-; X64-NEXT: cmovgeq %rcx, %r13
-; X64-NEXT: movq %r13, %xmm0
+; X64-NEXT: sbbq %r14, %rax
+; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: movq %rbp, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
-; X64-NEXT: movq %xmm0, %rbp
-; X64-NEXT: movq %rbp, %r14
-; X64-NEXT: sarq $63, %r14
-; X64-NEXT: shldq $31, %rbp, %r14
-; X64-NEXT: movq %rbp, %r15
-; X64-NEXT: shlq $31, %r15
+; X64-NEXT: movq %xmm0, %rbx
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: sarq $63, %r13
+; X64-NEXT: shldq $31, %rbx, %r13
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: sarq $63, %rbx
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, %rbp
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %r13
-; X64-NEXT: sbbq $0, %r12
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: subq $1, %rbp
+; X64-NEXT: sbbq $0, %r14
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %r15d, %ebx
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: shrq $63, %rbp
-; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %r13
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: cmpq %rcx, %rbp
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: sbbq $0, %rax
-; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r12
+; X64-NEXT: cmovgeq %rax, %r14
+; X64-NEXT: cmovgeq %rcx, %rbp
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %r13, %rcx
+; X64-NEXT: cmpq %rbp, %rcx
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r12, %rax
-; X64-NEXT: cmovgeq %rcx, %r13
-; X64-NEXT: movq %r13, %xmm0
+; X64-NEXT: sbbq %r14, %rax
+; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: movq %rbp, %xmm0
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: psrlq $1, %xmm1
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pxor %xmm0, %xmm0
-; X64-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; X64-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[0,1,1,3]
+; X64-NEXT: psllq $32, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; X64-NEXT: psrad $31, %xmm1
; X64-NEXT: psrlq $31, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm0, %rbp
-; X64-NEXT: movq %rbp, %r14
-; X64-NEXT: sarq $63, %r14
-; X64-NEXT: shldq $31, %rbp, %r14
-; X64-NEXT: movq %rbp, %r15
-; X64-NEXT: shlq $31, %r15
+; X64-NEXT: movq %xmm0, %rbx
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: sarq $63, %r13
+; X64-NEXT: shldq $31, %rbx, %r13
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: pcmpgtd %xmm0, %xmm1
@@ -698,92 +696,94 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: sarq $63, %rbx
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, %rbp
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %r13
-; X64-NEXT: sbbq $0, %r12
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: subq $1, %rbp
+; X64-NEXT: sbbq $0, %r14
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %r15d, %ebx
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: shrq $63, %rbp
-; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %r13
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: cmpq %rcx, %rbp
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: sbbq $0, %rax
-; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r12
+; X64-NEXT: cmovgeq %rax, %r14
+; X64-NEXT: cmovgeq %rcx, %rbp
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %r13, %rcx
+; X64-NEXT: cmpq %rbp, %rcx
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r12, %rax
-; X64-NEXT: cmovgeq %rcx, %r13
-; X64-NEXT: movq %r13, %xmm0
+; X64-NEXT: sbbq %r14, %rax
+; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: movq %rbp, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
-; X64-NEXT: movq %xmm0, %rbp
-; X64-NEXT: movq %rbp, %r14
-; X64-NEXT: sarq $63, %r14
-; X64-NEXT: shldq $31, %rbp, %r14
-; X64-NEXT: movq %rbp, %r15
-; X64-NEXT: shlq $31, %r15
+; X64-NEXT: movq %xmm0, %rbx
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: sarq $63, %r13
+; X64-NEXT: shldq $31, %rbx, %r13
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: sarq $63, %rbx
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: shlq $31, %r12
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, %rbp
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %r13
-; X64-NEXT: sbbq $0, %r12
-; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: subq $1, %rbp
+; X64-NEXT: sbbq $0, %r14
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: xorl %r15d, %ebx
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r13, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: shrq $63, %rbp
-; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %r13
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: cmpq %rcx, %rbp
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: sbbq $0, %rax
-; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r12
+; X64-NEXT: cmovgeq %rax, %r14
+; X64-NEXT: cmovgeq %rcx, %rbp
; X64-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %r13, %rax
-; X64-NEXT: sbbq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: cmovgeq %rax, %r13
-; X64-NEXT: movq %r13, %xmm1
+; X64-NEXT: cmpq %rbp, %rax
+; X64-NEXT: sbbq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT: cmovgeq %rax, %rbp
+; X64-NEXT: movq %rbp, %xmm1
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: psrlq $1, %xmm0
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index a80d8d8cd01b853..97c3c2040b2914c 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -46,6 +46,7 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-NEXT: movq 24(%rsi), %rcx
; CHECK-NEXT: movq 32(%rsi), %rdx
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
+; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2,2]
; CHECK-NEXT: .p2align 4, 0x90
@@ -53,45 +54,39 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB0_2 Depth 2
; CHECK-NEXT: xorpd %xmm3, %xmm3
-; CHECK-NEXT: movq $-1024, %rsi # imm = 0xFC00
+; CHECK-NEXT: movq $-1024, %rdi # imm = 0xFC00
; CHECK-NEXT: movdqa %xmm0, %xmm4
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %vector.body
; CHECK-NEXT: # Parent Loop BB0_1 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movdqu 1024(%rdx,%rsi), %xmm5
-; CHECK-NEXT: movdqu 1040(%rdx,%rsi), %xmm6
-; CHECK-NEXT: movq %xmm5, %rdi
-; CHECK-NEXT: movq %xmm6, %r8
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; CHECK-NEXT: movq %xmm5, %r9
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
-; CHECK-NEXT: movq %xmm5, %r10
-; CHECK-NEXT: negq %r8
+; CHECK-NEXT: cmpq 1024(%rdx,%rdi), %rsi
; CHECK-NEXT: movq %rcx, %r8
-; CHECK-NEXT: sbbq %r10, %r8
+; CHECK-NEXT: sbbq 1032(%rdx,%rdi), %r8
; CHECK-NEXT: setge %r8b
; CHECK-NEXT: movzbl %r8b, %r8d
+; CHECK-NEXT: andl $1, %r8d
; CHECK-NEXT: negq %r8
; CHECK-NEXT: movq %r8, %xmm5
-; CHECK-NEXT: negq %rdi
-; CHECK-NEXT: movq %rcx, %rdi
-; CHECK-NEXT: sbbq %r9, %rdi
-; CHECK-NEXT: setge %dil
-; CHECK-NEXT: movzbl %dil, %edi
-; CHECK-NEXT: negq %rdi
-; CHECK-NEXT: movq %rdi, %xmm6
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
-; CHECK-NEXT: movdqa %xmm1, %xmm5
-; CHECK-NEXT: psllq %xmm4, %xmm5
+; CHECK-NEXT: cmpq 1040(%rdx,%rdi), %rsi
+; CHECK-NEXT: movq %rcx, %r8
+; CHECK-NEXT: sbbq 1048(%rdx,%rdi), %r8
+; CHECK-NEXT: setge %r8b
+; CHECK-NEXT: movzbl %r8b, %r8d
+; CHECK-NEXT: andl $1, %r8d
+; CHECK-NEXT: negq %r8
+; CHECK-NEXT: movq %r8, %xmm6
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; CHECK-NEXT: movdqa %xmm1, %xmm6
+; CHECK-NEXT: psllq %xmm4, %xmm6
; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
; CHECK-NEXT: movdqa %xmm1, %xmm8
; CHECK-NEXT: psllq %xmm7, %xmm8
-; CHECK-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1]
-; CHECK-NEXT: andpd %xmm6, %xmm8
+; CHECK-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1]
+; CHECK-NEXT: andpd %xmm5, %xmm8
; CHECK-NEXT: orpd %xmm8, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm4
-; CHECK-NEXT: addq $32, %rsi
+; CHECK-NEXT: addq $32, %rdi
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # %bb.3: # %middle.block
; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
@@ -106,6 +101,7 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: movq 24(%rsi), %rcx
; CHECK-AVX2-NEXT: movq 32(%rsi), %rdx
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,1]
+; CHECK-AVX2-NEXT: xorl %esi, %esi
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,1]
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [2,2]
; CHECK-AVX2-NEXT: .p2align 4, 0x90
@@ -113,40 +109,34 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: # =>This Loop Header: Depth=1
; CHECK-AVX2-NEXT: # Child Loop BB0_2 Depth 2
; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: movq $-1024, %rsi # imm = 0xFC00
+; CHECK-AVX2-NEXT: movq $-1024, %rdi # imm = 0xFC00
; CHECK-AVX2-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-AVX2-NEXT: .p2align 4, 0x90
; CHECK-AVX2-NEXT: .LBB0_2: # %vector.body
; CHECK-AVX2-NEXT: # Parent Loop BB0_1 Depth=1
; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5
-; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6
-; CHECK-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm5[0],xmm6[0]
-; CHECK-AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; CHECK-AVX2-NEXT: vmovq %xmm5, %rdi
-; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %r8
-; CHECK-AVX2-NEXT: vmovq %xmm7, %r9
-; CHECK-AVX2-NEXT: vpextrq $1, %xmm7, %r10
-; CHECK-AVX2-NEXT: negq %r10
-; CHECK-AVX2-NEXT: movq %rcx, %r10
-; CHECK-AVX2-NEXT: sbbq %r8, %r10
+; CHECK-AVX2-NEXT: cmpq 1024(%rdx,%rdi), %rsi
+; CHECK-AVX2-NEXT: movq %rcx, %r8
+; CHECK-AVX2-NEXT: sbbq 1032(%rdx,%rdi), %r8
; CHECK-AVX2-NEXT: setge %r8b
; CHECK-AVX2-NEXT: movzbl %r8b, %r8d
+; CHECK-AVX2-NEXT: andl $1, %r8d
; CHECK-AVX2-NEXT: negq %r8
; CHECK-AVX2-NEXT: vmovq %r8, %xmm5
-; CHECK-AVX2-NEXT: negq %r9
+; CHECK-AVX2-NEXT: cmpq 1040(%rdx,%rdi), %rsi
; CHECK-AVX2-NEXT: movq %rcx, %r8
-; CHECK-AVX2-NEXT: sbbq %rdi, %r8
-; CHECK-AVX2-NEXT: setge %dil
-; CHECK-AVX2-NEXT: movzbl %dil, %edi
-; CHECK-AVX2-NEXT: negq %rdi
-; CHECK-AVX2-NEXT: vmovq %rdi, %xmm6
-; CHECK-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0]
+; CHECK-AVX2-NEXT: sbbq 1048(%rdx,%rdi), %r8
+; CHECK-AVX2-NEXT: setge %r8b
+; CHECK-AVX2-NEXT: movzbl %r8b, %r8d
+; CHECK-AVX2-NEXT: andl $1, %r8d
+; CHECK-AVX2-NEXT: negq %r8
+; CHECK-AVX2-NEXT: vmovq %r8, %xmm6
+; CHECK-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
; CHECK-AVX2-NEXT: vpsllvq %xmm4, %xmm1, %xmm6
; CHECK-AVX2-NEXT: vpand %xmm6, %xmm5, %xmm5
; CHECK-AVX2-NEXT: vpor %xmm3, %xmm5, %xmm3
; CHECK-AVX2-NEXT: vpaddq %xmm2, %xmm4, %xmm4
-; CHECK-AVX2-NEXT: addq $32, %rsi
+; CHECK-AVX2-NEXT: addq $32, %rdi
; CHECK-AVX2-NEXT: jne .LBB0_2
; CHECK-AVX2-NEXT: # %bb.3: # %middle.block
; CHECK-AVX2-NEXT: # in Loop: Header=BB0_1 Depth=1
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 460c5fe11f82a51..cee30f5fe5da9e9 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1045,12 +1045,16 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: saddo_v4i1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
-; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k2
+; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kshiftlw $12, %k2, %k0
+; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index d06993da6365d8e..64ed081048851bd 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -1062,12 +1062,16 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: ssubo_v4i1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
-; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
-; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k0
+; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kshiftlw $12, %k0, %k0
+; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index bac118095331ca5..950e943bd902013 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1098,12 +1098,16 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: uaddo_v4i1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
-; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k2
+; AVX512-NEXT: kandw %k1, %k0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kshiftlw $12, %k2, %k0
+; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index ab75ada72f25652..7de972770d8da4f 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -1145,12 +1145,16 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: usubo_v4i1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
-; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
-; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k0
+; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: kshiftlw $12, %k0, %k0
+; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll
index 27aaad6353ed687..78797b9acc2e6ec 100644
--- a/llvm/test/CodeGen/X86/vector-bo-select.ll
+++ b/llvm/test/CodeGen/X86/vector-bo-select.ll
@@ -3137,11 +3137,11 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef
; AVX512-LABEL: mul_v8i64_cast_cond:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw %edi, %k1
-; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512-NEXT: vpaddq %zmm2, %zmm3, %zmm2
+; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm2
+; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm2
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm3
+; AVX512-NEXT: vpmuludq %zmm1, %zmm3, %zmm3
+; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; AVX512-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm1
; AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm0 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 3aaa9268a8d8880..b839452725a95f4 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -58,12 +58,12 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE41-NEXT: psrlq %xmm4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pandn %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE41-NEXT: paddq %xmm0, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psllq %xmm1, %xmm3
; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: retq
;
@@ -76,11 +76,11 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -158,13 +158,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -366,13 +366,13 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; XOPAVX1-LABEL: var_funnnel_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -646,26 +646,26 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; XOPAVX1-LABEL: var_funnnel_v8i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpshlw %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v8i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT: vpsubw %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
-; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
@@ -995,26 +995,26 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; XOPAVX1-LABEL: var_funnnel_v16i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlb %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v16i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
-; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index fc65f759f5fbed6..7b6b0ea83c7eea3 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -486,22 +486,22 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; XOPAVX2-LABEL: var_funnnel_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
-; XOPAVX2-NEXT: vpshlw %xmm5, %xmm6, %xmm5
-; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
-; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; XOPAVX2-NEXT: vpsubw %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
+; XOPAVX2-NEXT: vpshlw %xmm5, %xmm7, %xmm5
+; XOPAVX2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpsubw %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; XOPAVX2-NEXT: vpshlw %xmm3, %xmm5, %xmm3
-; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm2
-; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt)
diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
index a6067a960fc0d63..0426c48aecfcffc 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
@@ -185,13 +185,13 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt)
; XOPAVX1-LABEL: var_funnnel_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 75baba5f35f792c..c54da38ef10cc18 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -927,9 +927,9 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: paddq %xmm0, %xmm1
-; SSE2-NEXT: psllq $7, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: psllq $7, %xmm1
+; SSE2-NEXT: paddq %xmm0, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
@@ -975,9 +975,9 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v2i64:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: paddq %xmm0, %xmm1
-; X86-SSE-NEXT: psllq $7, %xmm0
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE-NEXT: psllq $7, %xmm1
+; X86-SSE-NEXT: paddq %xmm0, %xmm0
+; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; X86-SSE-NEXT: retl
%shift = shl <2 x i64> %a, <i64 1, i64 7>
ret <2 x i64> %shift