[llvm] [SelectionDAG] Handle more opcodes in canCreateUndefOrPoison (PR #84921)
Björn Pettersson via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 12 07:15:24 PDT 2024
https://github.com/bjope created https://github.com/llvm/llvm-project/pull/84921
Deal with these possibly poison generating operations in SelectionDAG::canCreateUndefOrPoison:
EXTRACT_VECTOR_ELT, SRL, SRA
Also handle these operations that only propagate poison/undef based on the input operands:
SELECT, SADDSAT, UADDSAT, SSUBSAT, USUBSAT, MULHU, MULHS, SMIN,
SMAX, UMIN, UMAX
Also handle the integer comparison variant of SELECT_CC.
The goal here is to allow pushing freeze through these operations when allowed, as well as letting analyses such as
isGuaranteedNotToBeUndefOrPoison to not break on such operations.
>From 305970430ea3969f1e5fd208a7c083cea4fac78a Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson at ericsson.com>
Date: Tue, 12 Mar 2024 13:00:08 +0100
Subject: [PATCH] [SelectionDAG] Handle more opcodes in canCreateUndefOrPoison
Deal with these possibly poison generating operations in
SelectionDAG::canCreateUndefOrPoison:
EXTRACT_VECTOR_ELT, SRL, SRA
Also handle these operations that only propagate poison/undef based
on the input operands:
SELECT, SADDSAT, UADDSAT, SSUBSAT, USUBSAT, MULHU, MULHS, SMIN,
SMAX, UMIN, UMAX
Also handle the integer comparison variant of SELECT_CC.
The goal here is to allow pushing freeze through these operations
when allowed, as well as letting analyses such as
isGuaranteedNotToBeUndefOrPoison to not break on such operations.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 37 +-
llvm/test/CodeGen/AMDGPU/div_i128.ll | 96 ++--
llvm/test/CodeGen/AMDGPU/rem_i128.ll | 475 +++++++++---------
llvm/test/CodeGen/AMDGPU/sdiv64.ll | 12 +-
llvm/test/CodeGen/AMDGPU/srem64.ll | 26 +-
llvm/test/CodeGen/AMDGPU/udiv64.ll | 2 -
llvm/test/CodeGen/RISCV/iabs.ll | 4 +-
llvm/test/CodeGen/RISCV/rv64xtheadbb.ll | 12 +-
llvm/test/CodeGen/RISCV/rv64zbb.ll | 12 +-
.../CodeGen/RISCV/rvv/vec3-setcc-crash.ll | 12 +-
.../X86/div-rem-pair-recomposition-signed.ll | 366 +++++++-------
llvm/test/CodeGen/X86/freeze-binary.ll | 104 ++--
llvm/test/CodeGen/X86/midpoint-int-vec-512.ll | 206 ++++----
14 files changed, 708 insertions(+), 662 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5476ef87971436..9dfc64bc97a332 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15383,6 +15383,12 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
return N0;
+ // There is a reverse transform in visitEXTRACT_VECTOR_ELT, so we need to
+ // avoid inifite looping by not pulling freeze through EXTRACT_VECTOR_ELT
+ // here.
+ if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
// Try to push freeze through instructions that propagate but don't produce
// poison as far as possible. If an operand of freeze follows three
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c24303592769a0..02ee170303d4f9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5073,6 +5073,16 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::FREEZE:
case ISD::CONCAT_VECTORS:
case ISD::INSERT_SUBVECTOR:
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::MULHU:
+ case ISD::MULHS:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
case ISD::AND:
case ISD::XOR:
case ISD::ROTL:
@@ -5091,8 +5101,18 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::BITCAST:
case ISD::BUILD_VECTOR:
case ISD::BUILD_PAIR:
+ case ISD::SELECT:
return false;
+ case ISD::SELECT_CC: {
+ // Integer select_cc cannot create undef or poison.
+ if (Op.getOperand(0).getValueType().isInteger())
+ return false;
+
+ // FIXME: Handle FP compares.
+ return true;
+ }
+
case ISD::SETCC: {
// Integer setcc cannot create undef or poison.
if (Op.getOperand(0).getValueType().isInteger())
@@ -5123,7 +5143,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
return ConsiderFlags && (Op->getFlags().hasNoSignedWrap() ||
Op->getFlags().hasNoUnsignedWrap());
- case ISD::SHL:
+ case ISD::SHL: {
// If the max shift amount isn't in range, then the shift can create poison.
if (!getValidMaximumShiftAmountConstant(Op, DemandedElts))
return true;
@@ -5131,15 +5151,28 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
// Matches hasPoisonGeneratingFlags().
return ConsiderFlags && (Op->getFlags().hasNoSignedWrap() ||
Op->getFlags().hasNoUnsignedWrap());
+ }
+
+ case ISD::SRL:
+ case ISD::SRA: {
+ // If the max shift amount isn't in range, then the shift can create poison.
+ if (!getValidMaximumShiftAmountConstant(Op, DemandedElts))
+ return true;
+
+ // Matches hasPoisonGeneratingFlags().
+ return ConsiderFlags && Op->getFlags().hasExact();
+ }
// Matches hasPoisonGeneratingFlags().
case ISD::OR:
return ConsiderFlags && Op->getFlags().hasDisjoint();
+ case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:{
// Ensure that the element index is in bounds.
EVT VecVT = Op.getOperand(0).getValueType();
- KnownBits KnownIdx = computeKnownBits(Op.getOperand(2), Depth + 1);
+ unsigned IdxOp = Opcode == ISD::INSERT_VECTOR_ELT ? 2 : 1;
+ KnownBits KnownIdx = computeKnownBits(Op.getOperand(IdxOp), Depth + 1);
return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
}
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 2f3d5d9d140c2c..16649821580912 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -283,21 +283,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14
; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v4
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1
@@ -313,21 +313,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12
; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v8, v3, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v5, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v3, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v2, v5, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, v6
; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v4
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
@@ -340,18 +340,26 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7
@@ -404,7 +412,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
@@ -440,7 +449,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
@@ -691,10 +701,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
@@ -904,14 +914,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
@@ -1029,10 +1039,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 6ba66ccf71868e..b068d87c4d6f48 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -242,130 +242,137 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0: ; %bb.0: ; %_udiv-special-cases
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: s_mov_b32 s4, 63
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
-; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[6:7]
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[10:11]
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-O0-NEXT: v_ashrrev_i64 v[6:7], s4, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
-; GFX9-O0-NEXT: v_xor_b32_e64 v13, v11, v12
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6
+; GFX9-O0-NEXT: v_ashrrev_i64 v[15:16], s4, v[13:14]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-O0-NEXT: v_xor_b32_e64 v13, v8, v12
; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v12
-; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v7, v5, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v12
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16
+; GFX9-O0-NEXT: v_xor_b32_e64 v9, v8, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15
+; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, v6
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v2, v2, v6
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT: v_xor_b32_e64 v9, v9, v3
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, v6
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14
; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v12
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v11, v12, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v10, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v3, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v5, v6, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v2, v3, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -438,7 +445,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
@@ -474,7 +482,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
@@ -589,27 +598,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: ; %bb.2: ; %Flow
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(6)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_5
; GFX9-O0-NEXT: .LBB0_3: ; %Flow2
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -624,22 +633,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_9
; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 1
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1]
@@ -679,27 +688,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6
; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_4
; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while
; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -709,30 +718,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8
; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 63
; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3]
@@ -872,24 +881,24 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4
; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5
@@ -899,42 +908,42 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6
; GFX9-O0-NEXT: s_branch .LBB0_1
; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
@@ -1018,12 +1027,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8
@@ -1034,30 +1043,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_6
; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -1099,14 +1108,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4
; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12]
@@ -1152,12 +1161,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10
; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4
@@ -1172,18 +1181,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
@@ -1203,18 +1212,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 32
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[5:6]
@@ -1486,11 +1495,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
; GFX9-O0-NEXT: ; kill: killed $vgpr4
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index b086640c72f804..31e96f2fb3f662 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1089,16 +1089,12 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; GCN-IR-NEXT: s_mov_b32 s15, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24
-; GCN-IR-NEXT: s_sext_i32_i16 s7, s7
-; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24
-; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[4:5], 16
+; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[6:7], 16
+; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 40
; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
-; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16
; GCN-IR-NEXT: s_mov_b32 s3, s2
-; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40
; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[6:7]
; GCN-IR-NEXT: s_mov_b32 s5, s4
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index ed7f27b367fdaf..134164689c9a36 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1072,17 +1072,17 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_mov_b32 s13, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31
-; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 31
-; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31
-; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 31
+; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31
+; GCN-IR-NEXT: s_ashr_i32 s6, s1, 31
; GCN-IR-NEXT: s_mov_b32 s1, s0
-; GCN-IR-NEXT: s_mov_b32 s11, s10
+; GCN-IR-NEXT: s_mov_b32 s7, s6
; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7]
; GCN-IR-NEXT: s_sub_u32 s2, s2, s0
; GCN-IR-NEXT: s_subb_u32 s3, s3, s0
-; GCN-IR-NEXT: s_sub_u32 s8, s6, s10
-; GCN-IR-NEXT: s_subb_u32 s9, s7, s10
+; GCN-IR-NEXT: s_sub_u32 s8, s8, s6
+; GCN-IR-NEXT: s_subb_u32 s9, s9, s6
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[8:9]
@@ -1210,16 +1210,12 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; GCN-IR-NEXT: s_mov_b32 s13, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
-; GCN-IR-NEXT: s_sext_i32_i16 s7, s7
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24
-; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24
-; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16
-; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[4:5], 16
+; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[6:7], 16
+; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 40
; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
; GCN-IR-NEXT: s_ashr_i32 s10, s5, 31
-; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40
; GCN-IR-NEXT: s_mov_b32 s3, s2
; GCN-IR-NEXT: s_mov_b32 s11, s10
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 48b9c72ea68922..dfbab58a322611 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -840,8 +840,6 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
; GCN-IR-NEXT: s_and_b32 s4, s6, 0xff000000
; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], 24
; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[4:5], 24
-; GCN-IR-NEXT: s_and_b32 s9, s9, 0xffff
-; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[2:3]
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 98c886333d69a0..a0c85ab4dca7f7 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -630,8 +630,8 @@ define void @zext16_abs8(i8 %x, ptr %p) {
; RV32I-LABEL: zext16_abs8:
; RV32I: # %bb.0:
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: srai a0, a0, 24
; RV32I-NEXT: srai a2, a0, 31
+; RV32I-NEXT: srai a0, a0, 24
; RV32I-NEXT: xor a0, a0, a2
; RV32I-NEXT: sub a0, a0, a2
; RV32I-NEXT: sh a0, 0(a1)
@@ -648,8 +648,8 @@ define void @zext16_abs8(i8 %x, ptr %p) {
; RV64I-LABEL: zext16_abs8:
; RV64I: # %bb.0:
; RV64I-NEXT: slli a0, a0, 56
-; RV64I-NEXT: srai a0, a0, 56
; RV64I-NEXT: srai a2, a0, 63
+; RV64I-NEXT: srai a0, a0, 56
; RV64I-NEXT: xor a0, a0, a2
; RV64I-NEXT: subw a0, a0, a2
; RV64I-NEXT: sh a0, 0(a1)
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
index 1f62ea9f568191..dfcbc3b7d6497a 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
@@ -260,15 +260,17 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
; RV64I-NEXT: .cfi_def_cfa_offset 16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: .cfi_offset ra, -8
-; RV64I-NEXT: srliw a1, a0, 1
+; RV64I-NEXT: srli a1, a0, 1
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 2
+; RV64I-NEXT: srli a1, a0, 2
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 4
+; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 8
+; RV64I-NEXT: slli a1, a0, 33
+; RV64I-NEXT: srli a1, a1, 41
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 16
+; RV64I-NEXT: slli a1, a0, 33
+; RV64I-NEXT: srli a1, a1, 49
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: not a0, a0
; RV64I-NEXT: srli a1, a0, 1
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 2269d8d04c9cb0..ed09d554b84484 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -252,15 +252,17 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
; RV64I-NEXT: .cfi_def_cfa_offset 16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: .cfi_offset ra, -8
-; RV64I-NEXT: srliw a1, a0, 1
+; RV64I-NEXT: srli a1, a0, 1
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 2
+; RV64I-NEXT: srli a1, a0, 2
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 4
+; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 8
+; RV64I-NEXT: slli a1, a0, 33
+; RV64I-NEXT: srli a1, a1, 41
; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 16
+; RV64I-NEXT: slli a1, a0, 33
+; RV64I-NEXT: srli a1, a1, 49
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: not a0, a0
; RV64I-NEXT: srli a1, a0, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
index 122ac13cb25731..5d24aa083a4542 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
@@ -13,18 +13,18 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
; RV32: # %bb.0:
; RV32-NEXT: lw a0, 0(a0)
; RV32-NEXT: srli a2, a0, 16
-; RV32-NEXT: slli a3, a0, 16
-; RV32-NEXT: srli a4, a3, 24
-; RV32-NEXT: srai a3, a3, 24
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: slli a4, a0, 16
+; RV32-NEXT: srai a4, a4, 24
; RV32-NEXT: slli a5, a0, 24
; RV32-NEXT: srai a5, a5, 24
; RV32-NEXT: slli a6, a0, 8
; RV32-NEXT: srai a6, a6, 24
; RV32-NEXT: sgtz a6, a6
; RV32-NEXT: sgtz a5, a5
-; RV32-NEXT: sgtz a3, a3
-; RV32-NEXT: neg a3, a3
-; RV32-NEXT: and a3, a3, a4
+; RV32-NEXT: sgtz a4, a4
+; RV32-NEXT: neg a4, a4
+; RV32-NEXT: and a3, a4, a3
; RV32-NEXT: slli a3, a3, 8
; RV32-NEXT: neg a4, a5
; RV32-NEXT: and a0, a4, a0
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index e12ca56023a7f2..163e3c9eda7232 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -182,101 +182,101 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: sarl $31, %ebx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: xorl %ecx, %esi
; X86-NEXT: movl %esi, %ebp
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi
; X86-NEXT: subl %eax, %esi
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ebp
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl %ebx, %esi
; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: subl %edi, %ebp
-; X86-NEXT: sbbl %edi, %ebx
-; X86-NEXT: sbbl %edi, %edx
-; X86-NEXT: sbbl %edi, %esi
-; X86-NEXT: xorl %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: subl %ebx, %ebp
+; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %edi, %ecx
+; X86-NEXT: bsrl %ebx, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bsrl %ebp, %ebp
; X86-NEXT: xorl $31, %ebp
; X86-NEXT: addl $32, %ebp
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %edx, %ebp
; X86-NEXT: addl $64, %ebp
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %edi
+; X86-NEXT: orl %esi, %ebx
; X86-NEXT: cmovnel %ecx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: bsrl %edi, %esi
; X86-NEXT: xorl $31, %esi
-; X86-NEXT: bsrl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
; X86-NEXT: addl $32, %edx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: addl $64, %edx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: cmovnel %ecx, %edx
; X86-NEXT: xorl %esi, %esi
; X86-NEXT: subl %edx, %ebp
-; X86-NEXT: movl $0, %ebx
-; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
; X86-NEXT: movl $0, %eax
@@ -285,7 +285,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpl %ebp, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %ecx
@@ -294,30 +294,31 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: cmovnel %esi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: cmovnel %esi, %edx
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: cmovnel %esi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: cmovnel %esi, %eax
-; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: jne .LBB4_8
; X86-NEXT: # %bb.1: # %_udiv-special-cases
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: xorl $127, %ebx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: xorl $127, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: je .LBB4_8
; X86-NEXT: # %bb.2: # %udiv-bb1
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -332,234 +333,233 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $15, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %ebx
-; X86-NEXT: movl 144(%esp,%ebx), %edx
-; X86-NEXT: movl 148(%esp,%ebx), %edi
+; X86-NEXT: movsbl %al, %edi
+; X86-NEXT: movl 144(%esp,%edi), %edx
+; X86-NEXT: movl 148(%esp,%edi), %esi
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shll %cl, %edx
; X86-NEXT: notb %cl
-; X86-NEXT: movl 140(%esp,%ebx), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shrl %esi
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: movl 136(%esp,%ebx), %esi
+; X86-NEXT: movl 140(%esp,%edi), %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shrl %ebx
+; X86-NEXT: shrl %cl, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl 136(%esp,%edi), %edx
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $0, %edx
; X86-NEXT: jae .LBB4_3
; X86-NEXT: # %bb.6:
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: jmp .LBB4_7
; X86-NEXT: .LBB4_3: # %udiv-preheader
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movb %bl, %ch
+; X86-NEXT: movb %dl, %ch
; X86-NEXT: andb $7, %ch
-; X86-NEXT: movb %bl, %cl
+; X86-NEXT: movb %dl, %cl
; X86-NEXT: shrb $3, %cl
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ebp
-; X86-NEXT: movl 100(%esp,%ebp), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%esp,%ebp), %ebx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %esi, %edx
-; X86-NEXT: movl 88(%esp,%ebp), %ebp
-; X86-NEXT: movl 92(%esp,%eax), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: notb %cl
-; X86-NEXT: addl %ebx, %ebx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: movzbl %cl, %edx
+; X86-NEXT: movl 100(%esp,%edx), %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl 96(%esp,%edx), %edi
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %ebp
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: shrdl %cl, %esi, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 88(%esp,%edx), %ebx
+; X86-NEXT: movl 92(%esp,%edx), %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: notb %cl
+; X86-NEXT: addl %edi, %edi
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movb %ch, %cl
+; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: addl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB4_4: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: shldl $1, %ebp, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: shldl $1, %ebp, %edx
-; X86-NEXT: shldl $1, %edi, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: shldl $1, %edx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl $1, %eax, %ecx
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %eax, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sbbl %ebp, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl %edi, %esi
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %ebp
-; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: subl %ecx, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: sbbl %edi, %edx
-; X86-NEXT: movl (%esp), %edi # 4-byte Reload
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %eax, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edi, %esi
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %ebx, (%esp) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl $-1, %esi
-; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: jne .LBB4_4
; X86-NEXT: # %bb.5:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: shldl $1, %eax, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: shldl $1, %ebx, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: shldl $1, %eax, %ebx
+; X86-NEXT: orl %ecx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: shldl $1, %esi, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: addl %esi, %esi
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
; X86-NEXT: .LBB4_8: # %udiv-end
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, %edi
; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: xorl %ecx, %ebx
; X86-NEXT: xorl %ecx, %eax
; X86-NEXT: xorl %ecx, %esi
; X86-NEXT: subl %ecx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %ecx, %ebx
; X86-NEXT: sbbl %ecx, %edx
-; X86-NEXT: sbbl %ecx, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, (%ebp)
; X86-NEXT: movl %eax, 4(%ebp)
-; X86-NEXT: movl %edx, 8(%ebp)
-; X86-NEXT: movl %edi, 12(%ebp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %ebx, 8(%ebp)
+; X86-NEXT: movl %edx, 12(%ebp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: imull %esi, %ecx
@@ -568,12 +568,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: mull %edx
; X86-NEXT: addl %edx, %ebp
; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %ebp
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %edi, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: subl (%esp), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index defd81e6ab7710..b34302f74f1b41 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -452,15 +452,13 @@ define i32 @freeze_ashr(i32 %a0) nounwind {
; X86-LABEL: freeze_ashr:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sarl $3, %eax
-; X86-NEXT: sarl $3, %eax
+; X86-NEXT: sarl $6, %eax
; X86-NEXT: retl
;
; X64-LABEL: freeze_ashr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: sarl $3, %eax
-; X64-NEXT: sarl $3, %eax
+; X64-NEXT: sarl $6, %eax
; X64-NEXT: retq
%x = ashr i32 %a0, 3
%y = freeze i32 %x
@@ -472,19 +470,41 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind {
; X86-LABEL: freeze_ashr_exact:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sarl $9, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: freeze_ashr_exact:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: sarl $9, %eax
+; X64-NEXT: retq
+ %x = ashr exact i32 %a0, 3
+ %y = freeze i32 %x
+ %z = ashr i32 %y, 6
+ ret i32 %z
+}
+
+define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind {
+; X86-LABEL: freeze_ashr_exact_extra_use:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sarl $3, %eax
+; X86-NEXT: movl %eax, (%ecx)
; X86-NEXT: sarl $6, %eax
; X86-NEXT: retl
;
-; X64-LABEL: freeze_ashr_exact:
+; X64-LABEL: freeze_ashr_exact_extra_use:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: sarl $3, %eax
+; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: sarl $6, %eax
; X64-NEXT: retq
%x = ashr exact i32 %a0, 3
%y = freeze i32 %x
%z = ashr i32 %y, 6
+ store i32 %x, ptr %escape
ret i32 %z
}
@@ -507,30 +527,12 @@ define i32 @freeze_ashr_outofrange(i32 %a0) nounwind {
define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
; X86-LABEL: freeze_ashr_vec:
; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psraw $1, %xmm2
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: pandn %xmm2, %xmm3
-; X86-NEXT: psraw $3, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: por %xmm3, %xmm0
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psraw $3, %xmm2
-; X86-NEXT: psraw $1, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: pandn %xmm2, %xmm1
-; X86-NEXT: por %xmm1, %xmm0
+; X86-NEXT: psraw $4, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_ashr_vec:
; X64: # %bb.0:
-; X64-NEXT: vpsraw $1, %xmm0, %xmm1
-; X64-NEXT: vpsraw $3, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-NEXT: vpsraw $3, %xmm0, %xmm1
-; X64-NEXT: vpsraw $1, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT: vpsraw $4, %xmm0, %xmm0
; X64-NEXT: retq
%x = ashr <8 x i16> %a0, <i16 3, i16 1, i16 3, i16 1, i16 3, i16 1, i16 3, i16 1>
%y = freeze <8 x i16> %x
@@ -561,15 +563,13 @@ define i32 @freeze_lshr(i32 %a0) nounwind {
; X86-LABEL: freeze_lshr:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $2, %eax
-; X86-NEXT: shrl %eax
+; X86-NEXT: shrl $3, %eax
; X86-NEXT: retl
;
; X64-LABEL: freeze_lshr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $2, %eax
-; X64-NEXT: shrl %eax
+; X64-NEXT: shrl $3, %eax
; X64-NEXT: retq
%x = lshr i32 %a0, 2
%y = freeze i32 %x
@@ -581,19 +581,41 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind {
; X86-LABEL: freeze_lshr_exact:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrl $8, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: freeze_lshr_exact:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrl $8, %eax
+; X64-NEXT: retq
+ %x = lshr exact i32 %a0, 3
+ %y = freeze i32 %x
+ %z = lshr i32 %y, 5
+ ret i32 %z
+}
+
+define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind {
+; X86-LABEL: freeze_lshr_exact_extra_use:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shrl $3, %eax
+; X86-NEXT: movl %eax, (%ecx)
; X86-NEXT: shrl $5, %eax
; X86-NEXT: retl
;
-; X64-LABEL: freeze_lshr_exact:
+; X64-LABEL: freeze_lshr_exact_extra_use:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrl $3, %eax
+; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: shrl $5, %eax
; X64-NEXT: retq
%x = lshr exact i32 %a0, 3
%y = freeze i32 %x
%z = lshr i32 %y, 5
+ store i32 %x, ptr %escape
ret i32 %z
}
@@ -616,30 +638,12 @@ define i32 @freeze_lshr_outofrange(i32 %a0) nounwind {
define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
; X86-LABEL: freeze_lshr_vec:
; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psrlw $1, %xmm2
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: pandn %xmm2, %xmm3
-; X86-NEXT: psrlw $2, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: por %xmm3, %xmm0
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psrlw $2, %xmm2
-; X86-NEXT: psrlw $1, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: pandn %xmm2, %xmm1
-; X86-NEXT: por %xmm1, %xmm0
+; X86-NEXT: psrlw $3, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_lshr_vec:
; X64: # %bb.0:
-; X64-NEXT: vpsrlw $1, %xmm0, %xmm1
-; X64-NEXT: vpsrlw $2, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-NEXT: vpsrlw $2, %xmm0, %xmm1
-; X64-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT: vpsrlw $3, %xmm0, %xmm0
; X64-NEXT: retq
%x = lshr <8 x i16> %a0, <i16 2, i16 1, i16 2, i16 1, i16 2, i16 1, i16 2, i16 1>
%y = freeze <8 x i16> %x
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index 2fdf6ef224ca96..133dcdcf295ebb 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -684,22 +684,21 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -715,22 +714,21 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -772,20 +770,19 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6
; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm4, %zmm4
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -803,20 +800,19 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm4, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -859,22 +855,21 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5
; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpsubb %ymm0, %ymm6, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
@@ -891,22 +886,21 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm6, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
@@ -949,22 +943,21 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -981,22 +974,21 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -1040,22 +1032,21 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5
; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpsubb %ymm0, %ymm6, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
@@ -1073,22 +1064,21 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm6, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
More information about the llvm-commits
mailing list