[llvm] [DAG] SimplifyMultipleUseDemandedBits - ignore SRL node if we're just demanding known sign bits (PR #114389)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 31 06:36:21 PDT 2024
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/114389
>From e9e16961982ab368285d397a4aa83f00002654be Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 31 Oct 2024 10:52:30 +0000
Subject: [PATCH] [DAG] SimplifyMultipleUseDemandedBits - ignore SRL node if
we're just demanding known sign bits
Check whether we are only demanding (shifted) sign bits from an SRL node that are also sign bits in the source node.
We must not demand any of the upper zero bits that the SRL shifts in (up to the maximum shift amount), and the lowest demanded bit must already lie within the source's known sign bits, so every demanded bit of the shift result matches the corresponding bit of the source.
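As a minimal illustration of the condition (not part of the patch, and independent of the LLVM APIs), the following self-contained C++20 sketch brute-forces 8-bit values: whenever DemandedBits has at least MaxShAmt leading zeros and at least (BitWidth - NumSignBits) trailing zeros, the SRL result and its source agree on every demanded bit for any shift amount up to MaxShAmt, so the shift can be bypassed. The helper numSignBits is a hypothetical stand-in for DAG.ComputeNumSignBits.

#include <bit>
#include <cassert>
#include <cstdint>

// Number of identical leading bits (copies of the sign bit), minimum 1.
static unsigned numSignBits(uint8_t X) {
  uint8_t S = (X & 0x80) ? static_cast<uint8_t>(~X) : X;
  return S ? std::countl_zero(S) : 8;
}

int main() {
  constexpr unsigned BitWidth = 8;
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Demanded = 1; Demanded < 256; ++Demanded)
      for (unsigned MaxShAmt = 0; MaxShAmt < BitWidth; ++MaxShAmt) {
        unsigned NumSignBits = numSignBits(static_cast<uint8_t>(X));
        // No demanded bit may come from the zeros the SRL shifts in.
        bool NoShiftedZerosDemanded =
            std::countl_zero(static_cast<uint8_t>(Demanded)) >= (int)MaxShAmt;
        // Every demanded bit must already be a sign bit of the source.
        bool OnlySignBitsDemanded =
            std::countr_zero(static_cast<uint8_t>(Demanded)) >=
            (int)(BitWidth - NumSignBits);
        if (!(NoShiftedZerosDemanded && OnlySignBitsDemanded))
          continue;
        // Any shift up to the maximum leaves the demanded bits unchanged,
        // so the SRL node can be replaced by its source operand.
        for (unsigned C = 0; C <= MaxShAmt; ++C)
          assert(((X >> C) & Demanded) == (X & Demanded));
      }
  return 0;
}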
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 17 +
.../CodeGen/AMDGPU/div-rem-by-constant-64.ll | 50 +-
llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 152 ++---
llvm/test/CodeGen/RISCV/pr95284.ll | 22 +-
.../CodeGen/RISCV/urem-seteq-illegal-types.ll | 22 +-
llvm/test/CodeGen/X86/scmp.ll | 637 +++++++++---------
6 files changed, 445 insertions(+), 455 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index fabcbc5f0e856d..f7e97386056126 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -808,6 +808,23 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
}
break;
}
+ case ISD::SRL: {
+ // If we are only demanding sign bits then we can use the shift source
+ // directly.
+ if (std::optional<uint64_t> MaxSA =
+ DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) {
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ShAmt = *MaxSA;
+ unsigned NumSignBits =
+ DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ // Must already be signbits in DemandedBits bounds, and can't demand any
+ // shifted in zeroes.
+ if (DemandedBits.countl_zero() >= ShAmt &&
+ DemandedBits.countr_zero() >= (BitWidth - NumSignBits))
+ return Op0;
+ }
+ break;
+ }
case ISD::SETCC: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
index 4143c65a840d71..662de47413654f 100644
--- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
@@ -1052,16 +1052,15 @@ define noundef i64 @srem64_i32max(i64 noundef %i) {
; GFX9-NEXT: s_mov_b32 s6, 0x80000001
; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3]
-; GFX9-NEXT: v_mul_i32_i24_e32 v8, 3, v6
-; GFX9-NEXT: v_lshl_add_u32 v9, v6, 31, v6
-; GFX9-NEXT: v_mov_b32_e32 v10, v5
+; GFX9-NEXT: v_lshl_add_u32 v8, v6, 31, v6
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
-; GFX9-NEXT: v_add3_u32 v7, v7, v9, v8
+; GFX9-NEXT: v_add3_u32 v7, v7, v8, v6
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3]
; GFX9-NEXT: v_sub_u32_e32 v5, v5, v1
@@ -1085,10 +1084,9 @@ define noundef i64 @srem64_i32max(i64 noundef %i) {
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX942-NEXT: v_mul_i32_i24_e32 v4, 3, v2
-; GFX942-NEXT: v_lshl_add_u32 v5, v2, 31, v2
+; GFX942-NEXT: v_lshl_add_u32 v4, v2, 31, v2
; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 3, 0
-; GFX942-NEXT: v_add3_u32 v3, v3, v5, v4
+; GFX942-NEXT: v_add3_u32 v3, v3, v4, v2
; GFX942-NEXT: v_mul_hi_u32 v4, v0, 3
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v1, 3, v[4:5]
@@ -1125,17 +1123,16 @@ define noundef i64 @srem64_i32max(i64 noundef %i) {
; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 3
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GFX1030-NEXT: v_mul_i32_i24_e32 v7, 3, v6
; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3]
-; GFX1030-NEXT: v_mov_b32_e32 v8, v5
+; GFX1030-NEXT: v_mov_b32_e32 v7, v5
; GFX1030-NEXT: v_mov_b32_e32 v5, v3
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v6, 3, 0
; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6
; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5]
-; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v7
+; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v2
; GFX1030-NEXT: v_mov_b32_e32 v4, v5
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3]
-; GFX1030-NEXT: v_add_co_u32 v4, s4, v8, v4
+; GFX1030-NEXT: v_add_co_u32 v4, s4, v7, v4
; GFX1030-NEXT: v_add_co_ci_u32_e64 v5, null, 0, 0, s4
; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v3, v1
; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5]
@@ -1167,16 +1164,15 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) {
; GFX9-NEXT: s_mov_b32 s6, 0x80000001
; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3]
-; GFX9-NEXT: v_mul_i32_i24_e32 v8, 3, v6
-; GFX9-NEXT: v_lshl_add_u32 v9, v6, 31, v6
-; GFX9-NEXT: v_mov_b32_e32 v10, v5
+; GFX9-NEXT: v_lshl_add_u32 v8, v6, 31, v6
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
-; GFX9-NEXT: v_add3_u32 v7, v7, v9, v8
+; GFX9-NEXT: v_add3_u32 v7, v7, v8, v6
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3]
; GFX9-NEXT: v_sub_u32_e32 v5, v5, v1
@@ -1195,10 +1191,9 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) {
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX942-NEXT: v_mul_i32_i24_e32 v4, 3, v2
-; GFX942-NEXT: v_lshl_add_u32 v5, v2, 31, v2
+; GFX942-NEXT: v_lshl_add_u32 v4, v2, 31, v2
; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 3, 0
-; GFX942-NEXT: v_add3_u32 v3, v3, v5, v4
+; GFX942-NEXT: v_add3_u32 v3, v3, v4, v2
; GFX942-NEXT: v_mul_hi_u32 v4, v0, 3
; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v1, 3, v[4:5]
@@ -1227,17 +1222,16 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) {
; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 3
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GFX1030-NEXT: v_mul_i32_i24_e32 v7, 3, v6
; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3]
-; GFX1030-NEXT: v_mov_b32_e32 v8, v5
+; GFX1030-NEXT: v_mov_b32_e32 v7, v5
; GFX1030-NEXT: v_mov_b32_e32 v5, v3
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v6, 3, 0
; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6
; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5]
-; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v7
+; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v2
; GFX1030-NEXT: v_mov_b32_e32 v4, v5
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3]
-; GFX1030-NEXT: v_add_co_u32 v4, s4, v8, v4
+; GFX1030-NEXT: v_add_co_u32 v4, s4, v7, v4
; GFX1030-NEXT: v_add_co_ci_u32_e64 v5, null, 0, 0, s4
; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v3, v1
; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 786fe03164690e..68ebc21e2ba4d6 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -37,12 +37,11 @@ define i128 @fptosi_f64_to_i128(double %x) {
; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9
; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0
-; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5]
; SDAG-NEXT: s_mov_b64 s[4:5], 0x432
; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc
; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
@@ -62,34 +61,33 @@ define i128 @fptosi_f64_to_i128(double %x) {
; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5]
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0
; SDAG-NEXT: v_mov_b32_e32 v3, 0
-; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2
-; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v2, 0
+; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
; SDAG-NEXT: v_mov_b32_e32 v2, v1
-; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v11, v[2:3]
-; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12
-; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13
+; SDAG-NEXT: v_mul_lo_u32 v6, v10, v6
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v10, v[2:3]
+; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7
+; SDAG-NEXT: v_add3_u32 v5, v5, v6, v12
; SDAG-NEXT: v_mov_b32_e32 v6, v2
; SDAG-NEXT: v_mov_b32_e32 v2, v3
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[1:2]
-; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[4:5]
; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2
; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7
+; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11
; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6]
-; SDAG-NEXT: ; implicit-def: $vgpr11
; SDAG-NEXT: ; implicit-def: $vgpr8
-; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9
+; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10
; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3
; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5]
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
-; SDAG-NEXT: ; implicit-def: $vgpr9
; SDAG-NEXT: ; implicit-def: $vgpr10
+; SDAG-NEXT: ; implicit-def: $vgpr9
; SDAG-NEXT: .LBB0_4: ; %Flow
; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13]
; SDAG-NEXT: s_cbranch_execz .LBB0_6
@@ -102,9 +100,9 @@ define i128 @fptosi_f64_to_i128(double %x) {
; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7]
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v11, 0
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v11, v[1:2]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2]
; SDAG-NEXT: v_mov_b32_e32 v7, v4
; SDAG-NEXT: v_mov_b32_e32 v4, v2
; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4]
@@ -112,7 +110,7 @@ define i128 @fptosi_f64_to_i128(double %x) {
; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5]
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3]
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3]
-; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v6, v[3:4]
; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3
; SDAG-NEXT: .LBB0_6: ; %Flow1
; SDAG-NEXT: s_or_b64 exec, exec, s[12:13]
@@ -409,12 +407,11 @@ define i128 @fptoui_f64_to_i128(double %x) {
; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9
; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0
-; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5]
; SDAG-NEXT: s_mov_b64 s[4:5], 0x432
; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc
; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
@@ -434,34 +431,33 @@ define i128 @fptoui_f64_to_i128(double %x) {
; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5]
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0
; SDAG-NEXT: v_mov_b32_e32 v3, 0
-; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2
-; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v2, 0
+; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
; SDAG-NEXT: v_mov_b32_e32 v2, v1
-; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v11, v[2:3]
-; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12
-; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13
+; SDAG-NEXT: v_mul_lo_u32 v6, v10, v6
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v10, v[2:3]
+; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7
+; SDAG-NEXT: v_add3_u32 v5, v5, v6, v12
; SDAG-NEXT: v_mov_b32_e32 v6, v2
; SDAG-NEXT: v_mov_b32_e32 v2, v3
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[1:2]
-; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[4:5]
; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2
; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7
+; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11
; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6]
-; SDAG-NEXT: ; implicit-def: $vgpr11
; SDAG-NEXT: ; implicit-def: $vgpr8
-; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9
+; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10
; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3
; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5]
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
-; SDAG-NEXT: ; implicit-def: $vgpr9
; SDAG-NEXT: ; implicit-def: $vgpr10
+; SDAG-NEXT: ; implicit-def: $vgpr9
; SDAG-NEXT: .LBB1_4: ; %Flow
; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13]
; SDAG-NEXT: s_cbranch_execz .LBB1_6
@@ -474,9 +470,9 @@ define i128 @fptoui_f64_to_i128(double %x) {
; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7]
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v11, 0
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v11, v[1:2]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2]
; SDAG-NEXT: v_mov_b32_e32 v7, v4
; SDAG-NEXT: v_mov_b32_e32 v4, v2
; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4]
@@ -484,7 +480,7 @@ define i128 @fptoui_f64_to_i128(double %x) {
; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5]
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3]
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3]
-; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v6, v[3:4]
; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3
; SDAG-NEXT: .LBB1_6: ; %Flow1
; SDAG-NEXT: s_or_b64 exec, exec, s[12:13]
@@ -780,7 +776,6 @@ define i128 @fptosi_f32_to_i128(float %x) {
; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9
; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0
-; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5]
; SDAG-NEXT: s_mov_b64 s[4:5], 0x95
; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
@@ -806,24 +801,24 @@ define i128 @fptosi_f32_to_i128(float %x) {
; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5]
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, 0
-; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2
-; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0
+; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2
+; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3
; SDAG-NEXT: v_mov_b32_e32 v6, v1
-; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[6:7]
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7]
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0
; SDAG-NEXT: v_mov_b32_e32 v6, v5
; SDAG-NEXT: v_mov_b32_e32 v5, v7
-; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, v[4:5]
-; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v13, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5]
+; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3]
; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5
; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12
-; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13
-; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v8, v[5:6]
+; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11
+; SDAG-NEXT: v_mul_lo_u32 v7, v9, v12
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6]
; SDAG-NEXT: ; implicit-def: $vgpr10
; SDAG-NEXT: ; implicit-def: $vgpr8
; SDAG-NEXT: ; implicit-def: $vgpr9
@@ -1138,7 +1133,6 @@ define i128 @fptoui_f32_to_i128(float %x) {
; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9
; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0
-; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5]
; SDAG-NEXT: s_mov_b64 s[4:5], 0x95
; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
@@ -1164,24 +1158,24 @@ define i128 @fptoui_f32_to_i128(float %x) {
; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5]
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, 0
-; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2
-; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0
+; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2
+; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3
; SDAG-NEXT: v_mov_b32_e32 v6, v1
-; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[6:7]
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7]
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0
; SDAG-NEXT: v_mov_b32_e32 v6, v5
; SDAG-NEXT: v_mov_b32_e32 v5, v7
-; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, v[4:5]
-; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v13, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5]
+; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3]
; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5
; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12
-; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13
-; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v8, v[5:6]
+; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11
+; SDAG-NEXT: v_mul_lo_u32 v7, v9, v12
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6]
; SDAG-NEXT: ; implicit-def: $vgpr10
; SDAG-NEXT: ; implicit-def: $vgpr8
; SDAG-NEXT: ; implicit-def: $vgpr9
@@ -1551,26 +1545,25 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0
-; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2
+; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3
; SDAG-NEXT: v_mov_b32_e32 v6, v1
; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7]
-; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0
-; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10
-; SDAG-NEXT: v_mov_b32_e32 v10, v5
+; SDAG-NEXT: v_mov_b32_e32 v8, v5
; SDAG-NEXT: v_mov_b32_e32 v5, v7
-; SDAG-NEXT: v_addc_co_u32_e64 v8, s[4:5], 0, -1, s[4:5]
; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10
; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13
; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3]
-; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v10, v5
+; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5
; SDAG-NEXT: v_mul_lo_u32 v3, v6, v11
+; SDAG-NEXT: v_mul_lo_u32 v7, v6, v12
; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v7, v8, v12
; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6]
-; SDAG-NEXT: ; implicit-def: $vgpr8
; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3
+; SDAG-NEXT: ; implicit-def: $vgpr8
; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1
; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6
@@ -1903,26 +1896,25 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5]
; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0
-; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2
+; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3
; SDAG-NEXT: v_mov_b32_e32 v6, v1
; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7]
-; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3
; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0
-; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10
-; SDAG-NEXT: v_mov_b32_e32 v10, v5
+; SDAG-NEXT: v_mov_b32_e32 v8, v5
; SDAG-NEXT: v_mov_b32_e32 v5, v7
-; SDAG-NEXT: v_addc_co_u32_e64 v8, s[4:5], 0, -1, s[4:5]
; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10
; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13
; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3]
-; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v10, v5
+; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5
; SDAG-NEXT: v_mul_lo_u32 v3, v6, v11
+; SDAG-NEXT: v_mul_lo_u32 v7, v6, v12
; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_mul_lo_u32 v7, v8, v12
; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6]
-; SDAG-NEXT: ; implicit-def: $vgpr8
; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3
+; SDAG-NEXT: ; implicit-def: $vgpr8
; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1
; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6
diff --git a/llvm/test/CodeGen/RISCV/pr95284.ll b/llvm/test/CodeGen/RISCV/pr95284.ll
index 135e128c00bac4..82600d8d3df51c 100644
--- a/llvm/test/CodeGen/RISCV/pr95284.ll
+++ b/llvm/test/CodeGen/RISCV/pr95284.ll
@@ -6,19 +6,17 @@
define signext i64 @PR95284(i32 signext %0) {
; RV32I-LABEL: PR95284:
; RV32I: # %bb.0: # %entry
-; RV32I-NEXT: seqz a1, a0
-; RV32I-NEXT: neg a2, a1
-; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: srli a2, a2, 1
-; RV32I-NEXT: srli a0, a0, 1
-; RV32I-NEXT: slli a1, a1, 31
-; RV32I-NEXT: or a0, a1, a0
-; RV32I-NEXT: addi a0, a0, 1
-; RV32I-NEXT: seqz a1, a0
-; RV32I-NEXT: add a1, a2, a1
-; RV32I-NEXT: slli a1, a1, 1
+; RV32I-NEXT: addi a1, a0, -1
+; RV32I-NEXT: seqz a0, a0
+; RV32I-NEXT: slli a2, a0, 31
+; RV32I-NEXT: srli a1, a1, 1
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: addi a1, a1, 1
+; RV32I-NEXT: seqz a2, a1
+; RV32I-NEXT: sub a2, a2, a0
+; RV32I-NEXT: andi a0, a1, -2
+; RV32I-NEXT: slli a1, a2, 1
; RV32I-NEXT: srli a1, a1, 1
-; RV32I-NEXT: andi a0, a0, -2
; RV32I-NEXT: ret
;
; RV64I-LABEL: PR95284:
diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
index 0ee067b673da9a..b887036372f7b2 100644
--- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
@@ -329,6 +329,7 @@ define void @test_urem_vec(ptr %X) nounwind {
; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: mv s0, a0
; RV32-NEXT: lbu a0, 4(a0)
; RV32-NEXT: lw a1, 0(s0)
@@ -351,6 +352,7 @@ define void @test_urem_vec(ptr %X) nounwind {
; RV32-NEXT: addi a0, a0, -1638
; RV32-NEXT: andi a0, a0, 2047
; RV32-NEXT: sltiu s1, a0, 2
+; RV32-NEXT: xori s4, s1, 1
; RV32-NEXT: li a1, 1463
; RV32-NEXT: mv a0, s2
; RV32-NEXT: call __mulsi3
@@ -358,23 +360,22 @@ define void @test_urem_vec(ptr %X) nounwind {
; RV32-NEXT: andi a0, a0, 2047
; RV32-NEXT: sltiu a0, a0, 293
; RV32-NEXT: addi s3, s3, -1
-; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: addi s1, s1, -1
-; RV32-NEXT: slli a1, s1, 21
-; RV32-NEXT: srli a1, a1, 31
-; RV32-NEXT: andi a2, s3, 2047
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: andi a1, s3, 2047
; RV32-NEXT: andi a0, a0, 2047
; RV32-NEXT: slli a0, a0, 11
; RV32-NEXT: slli s1, s1, 22
; RV32-NEXT: or a0, a0, s1
-; RV32-NEXT: or a0, a2, a0
+; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: sw a0, 0(s0)
-; RV32-NEXT: sb a1, 4(s0)
+; RV32-NEXT: sb s4, 4(s0)
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
@@ -457,16 +458,15 @@ define void @test_urem_vec(ptr %X) nounwind {
; RV32M-NEXT: addi a1, a1, -1638
; RV32M-NEXT: andi a1, a1, 2047
; RV32M-NEXT: sltiu a1, a1, 2
-; RV32M-NEXT: li a4, 1463
-; RV32M-NEXT: mul a3, a3, a4
+; RV32M-NEXT: xori a4, a1, 1
+; RV32M-NEXT: li a5, 1463
+; RV32M-NEXT: mul a3, a3, a5
; RV32M-NEXT: addi a3, a3, -1463
; RV32M-NEXT: andi a3, a3, 2047
; RV32M-NEXT: sltiu a3, a3, 293
; RV32M-NEXT: addi a2, a2, -1
-; RV32M-NEXT: addi a3, a3, -1
; RV32M-NEXT: addi a1, a1, -1
-; RV32M-NEXT: slli a4, a1, 21
-; RV32M-NEXT: srli a4, a4, 31
+; RV32M-NEXT: addi a3, a3, -1
; RV32M-NEXT: andi a2, a2, 2047
; RV32M-NEXT: andi a3, a3, 2047
; RV32M-NEXT: slli a3, a3, 11
diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index 5ae5caf3e88b20..537e05310dbea8 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -1763,7 +1763,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE2-NEXT: pushq %r13
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movq %rdi, %r14
+; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
@@ -1779,11 +1779,11 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE2-NEXT: setl %sil
; SSE2-NEXT: setg %dil
; SSE2-NEXT: subb %sil, %dil
-; SSE2-NEXT: movsbq %dil, %rax
-; SSE2-NEXT: movq %rax, (%r14)
-; SSE2-NEXT: movq %rax, %rsi
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movsbq %dil, %rdi
+; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movq %rdi, (%rax)
+; SSE2-NEXT: sarq $63, %rdi
+; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: addb %r11b, %r11b
; SSE2-NEXT: sarb %r11b
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
@@ -1793,9 +1793,9 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE2-NEXT: setl %sil
; SSE2-NEXT: setg %r11b
; SSE2-NEXT: subb %sil, %r11b
-; SSE2-NEXT: movsbq %r11b, %rdi
-; SSE2-NEXT: movq %rdi, %r11
-; SSE2-NEXT: sarq $63, %r11
+; SSE2-NEXT: movsbq %r11b, %r11
+; SSE2-NEXT: movq %r11, %r14
+; SSE2-NEXT: sarq $63, %r14
; SSE2-NEXT: addb %r12b, %r12b
; SSE2-NEXT: sarb %r12b
; SSE2-NEXT: addb %dl, %dl
@@ -1804,18 +1804,18 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE2-NEXT: setl %dl
; SSE2-NEXT: setg %sil
; SSE2-NEXT: subb %dl, %sil
-; SSE2-NEXT: movsbq %sil, %rdx
-; SSE2-NEXT: movq %rdx, %r13
-; SSE2-NEXT: sarq $63, %r13
+; SSE2-NEXT: movsbq %sil, %r13
+; SSE2-NEXT: movq %r13, %rdi
+; SSE2-NEXT: sarq $63, %rdi
; SSE2-NEXT: addb %r15b, %r15b
; SSE2-NEXT: sarb %r15b
; SSE2-NEXT: addb %cl, %cl
; SSE2-NEXT: sarb %cl
; SSE2-NEXT: cmpb %r15b, %cl
; SSE2-NEXT: setl %cl
-; SSE2-NEXT: setg %sil
-; SSE2-NEXT: subb %cl, %sil
-; SSE2-NEXT: movsbq %sil, %r15
+; SSE2-NEXT: setg %dl
+; SSE2-NEXT: subb %cl, %dl
+; SSE2-NEXT: movsbq %dl, %r15
; SSE2-NEXT: movq %r15, %rcx
; SSE2-NEXT: sarq $63, %rcx
; SSE2-NEXT: addb %bpl, %bpl
@@ -1823,9 +1823,9 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE2-NEXT: addb %r8b, %r8b
; SSE2-NEXT: sarb %r8b
; SSE2-NEXT: cmpb %bpl, %r8b
-; SSE2-NEXT: setl %sil
+; SSE2-NEXT: setl %dl
; SSE2-NEXT: setg %r8b
-; SSE2-NEXT: subb %sil, %r8b
+; SSE2-NEXT: subb %dl, %r8b
; SSE2-NEXT: movsbq %r8b, %r8
; SSE2-NEXT: movq %r8, %r12
; SSE2-NEXT: sarq $63, %r12
@@ -1834,85 +1834,83 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE2-NEXT: addb %r9b, %r9b
; SSE2-NEXT: sarb %r9b
; SSE2-NEXT: cmpb %bl, %r9b
-; SSE2-NEXT: setl %sil
+; SSE2-NEXT: setl %dl
; SSE2-NEXT: setg %r9b
-; SSE2-NEXT: subb %sil, %r9b
-; SSE2-NEXT: movsbq %r9b, %r9
-; SSE2-NEXT: movq %r9, %rbx
-; SSE2-NEXT: sarq $63, %rbx
+; SSE2-NEXT: subb %dl, %r9b
+; SSE2-NEXT: movsbq %r9b, %rsi
+; SSE2-NEXT: movq %rsi, %r9
+; SSE2-NEXT: sarq $63, %r9
; SSE2-NEXT: addb %r10b, %r10b
; SSE2-NEXT: sarb %r10b
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
-; SSE2-NEXT: addb %sil, %sil
-; SSE2-NEXT: sarb %sil
-; SSE2-NEXT: cmpb %r10b, %sil
-; SSE2-NEXT: setl %sil
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT: addb %dl, %dl
+; SSE2-NEXT: sarb %dl
+; SSE2-NEXT: cmpb %r10b, %dl
+; SSE2-NEXT: setl %dl
; SSE2-NEXT: setg %r10b
-; SSE2-NEXT: subb %sil, %r10b
-; SSE2-NEXT: movsbq %r10b, %rbp
-; SSE2-NEXT: movq %rbp, %r10
-; SSE2-NEXT: sarq $63, %r10
-; SSE2-NEXT: movq %r10, %rsi
-; SSE2-NEXT: shldq $62, %rbp, %rsi
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: movq %rsi, 88(%r14)
-; SSE2-NEXT: shrq $2, %r10
-; SSE2-NEXT: movl %r10d, 96(%r14)
-; SSE2-NEXT: movq %rbx, %rsi
-; SSE2-NEXT: shldq $20, %r9, %rsi
-; SSE2-NEXT: movq %rsi, 64(%r14)
-; SSE2-NEXT: movq %r12, %rsi
-; SSE2-NEXT: shldq $31, %r8, %rsi
-; SSE2-NEXT: movq %rsi, 48(%r14)
-; SSE2-NEXT: movq %rcx, %rsi
-; SSE2-NEXT: shldq $42, %r15, %rsi
-; SSE2-NEXT: movabsq $9007199254738944, %rax # imm = 0x1FFFFFFFFFF800
-; SSE2-NEXT: andq %r13, %rax
-; SSE2-NEXT: shldq $53, %rdx, %r13
-; SSE2-NEXT: movq %rsi, 32(%r14)
-; SSE2-NEXT: movq %r13, 16(%r14)
-; SSE2-NEXT: movabsq $9007199254740991, %rsi # imm = 0x1FFFFFFFFFFFFF
-; SSE2-NEXT: andq %rsi, %r11
-; SSE2-NEXT: shldq $9, %rdi, %r11
-; SSE2-NEXT: shlq $62, %rbp
-; SSE2-NEXT: orq %r11, %rbp
-; SSE2-NEXT: movq %rbp, 80(%r14)
-; SSE2-NEXT: movabsq $2251799813685247, %r11 # imm = 0x7FFFFFFFFFFFF
-; SSE2-NEXT: andq %r10, %r11
-; SSE2-NEXT: movq %r11, %r10
-; SSE2-NEXT: shrq $48, %r10
-; SSE2-NEXT: movb %r10b, 102(%r14)
-; SSE2-NEXT: shrq $32, %r11
-; SSE2-NEXT: movw %r11w, 100(%r14)
+; SSE2-NEXT: subb %dl, %r10b
+; SSE2-NEXT: movsbq %r10b, %r10
+; SSE2-NEXT: movq %r10, %rdx
+; SSE2-NEXT: sarq $63, %rdx
+; SSE2-NEXT: movl %edx, 96(%rax)
+; SSE2-NEXT: movabsq $2251799813685247, %rbp # imm = 0x7FFFFFFFFFFFF
+; SSE2-NEXT: andq %rdx, %rbp
+; SSE2-NEXT: shldq $62, %r10, %rdx
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero
+; SSE2-NEXT: movq %r9, %rbx
+; SSE2-NEXT: shldq $20, %rsi, %rbx
+; SSE2-NEXT: movq %rdx, 88(%rax)
+; SSE2-NEXT: movq %r12, %rdx
+; SSE2-NEXT: shldq $31, %r8, %rdx
+; SSE2-NEXT: movq %rbx, 64(%rax)
+; SSE2-NEXT: movq %rcx, %rbx
+; SSE2-NEXT: shldq $42, %r15, %rbx
+; SSE2-NEXT: movq %rdx, 48(%rax)
+; SSE2-NEXT: movq %rbx, 32(%rax)
+; SSE2-NEXT: movabsq $9007199254738944, %rbx # imm = 0x1FFFFFFFFFF800
+; SSE2-NEXT: andq %rdi, %rbx
+; SSE2-NEXT: shldq $53, %r13, %rdi
+; SSE2-NEXT: movq %rdi, 16(%rax)
+; SSE2-NEXT: movq %rbp, %rdx
+; SSE2-NEXT: shrq $48, %rdx
+; SSE2-NEXT: movb %dl, 102(%rax)
+; SSE2-NEXT: shrq $32, %rbp
+; SSE2-NEXT: movabsq $9007199254740991, %rdx # imm = 0x1FFFFFFFFFFFFF
+; SSE2-NEXT: andq %rdx, %r14
+; SSE2-NEXT: shldq $9, %r11, %r14
+; SSE2-NEXT: movw %bp, 100(%rax)
+; SSE2-NEXT: shlq $62, %r10
+; SSE2-NEXT: orq %r14, %r10
+; SSE2-NEXT: movq %r10, 80(%rax)
; SSE2-NEXT: shlq $42, %r15
-; SSE2-NEXT: shrq $11, %rax
-; SSE2-NEXT: orq %r15, %rax
-; SSE2-NEXT: movq %rax, 24(%r14)
-; SSE2-NEXT: shlq $9, %rdi
-; SSE2-NEXT: shrq $44, %rbx
-; SSE2-NEXT: andl $511, %ebx # imm = 0x1FF
-; SSE2-NEXT: orq %rdi, %rbx
-; SSE2-NEXT: movq %rbx, 72(%r14)
-; SSE2-NEXT: shlq $20, %r9
+; SSE2-NEXT: shrq $11, %rbx
+; SSE2-NEXT: orq %r15, %rbx
+; SSE2-NEXT: movq %rbx, 24(%rax)
+; SSE2-NEXT: shlq $9, %r11
+; SSE2-NEXT: shrq $44, %r9
+; SSE2-NEXT: andl $511, %r9d # imm = 0x1FF
+; SSE2-NEXT: orq %r11, %r9
+; SSE2-NEXT: movq %r9, 72(%rax)
+; SSE2-NEXT: shlq $20, %rsi
; SSE2-NEXT: shrq $33, %r12
; SSE2-NEXT: andl $1048575, %r12d # imm = 0xFFFFF
-; SSE2-NEXT: orq %r9, %r12
-; SSE2-NEXT: movq %r12, 56(%r14)
+; SSE2-NEXT: orq %rsi, %r12
+; SSE2-NEXT: movq %r12, 56(%rax)
; SSE2-NEXT: shlq $31, %r8
; SSE2-NEXT: shrq $22, %rcx
; SSE2-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF
; SSE2-NEXT: orq %r8, %rcx
-; SSE2-NEXT: movq %rcx, 40(%r14)
+; SSE2-NEXT: movq %rcx, 40(%rax)
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload
; SSE2-NEXT: # xmm1 = mem[0],zero
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: andq %rsi, %rax
-; SSE2-NEXT: shlq $53, %rdx
-; SSE2-NEXT: orq %rax, %rdx
-; SSE2-NEXT: movq %rdx, 8(%r14)
-; SSE2-NEXT: movq %r14, %rax
+; SSE2-NEXT: movq %xmm0, %rcx
+; SSE2-NEXT: andq %rdx, %rcx
+; SSE2-NEXT: shlq $53, %r13
+; SSE2-NEXT: orq %rcx, %r13
+; SSE2-NEXT: movq %r13, 8(%rax)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
@@ -1929,151 +1927,148 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE4-NEXT: pushq %r13
; SSE4-NEXT: pushq %r12
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: movq %rdi, %r14
+; SSE4-NEXT: movq %rdi, %rbx
+; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
-; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
-; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
-; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
-; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
-; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; SSE4-NEXT: addb %dil, %dil
-; SSE4-NEXT: sarb %dil
+; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; SSE4-NEXT: addb %r14b, %r14b
+; SSE4-NEXT: sarb %r14b
; SSE4-NEXT: addb %sil, %sil
; SSE4-NEXT: sarb %sil
-; SSE4-NEXT: cmpb %dil, %sil
+; SSE4-NEXT: cmpb %r14b, %sil
; SSE4-NEXT: setl %sil
-; SSE4-NEXT: setg %dil
-; SSE4-NEXT: subb %sil, %dil
-; SSE4-NEXT: movsbq %dil, %r12
-; SSE4-NEXT: movq %r12, %rdi
-; SSE4-NEXT: sarq $63, %rdi
-; SSE4-NEXT: addb %r10b, %r10b
-; SSE4-NEXT: sarb %r10b
+; SSE4-NEXT: setg %r14b
+; SSE4-NEXT: subb %sil, %r14b
+; SSE4-NEXT: movsbq %r14b, %r14
+; SSE4-NEXT: movq %r14, (%rbx)
+; SSE4-NEXT: sarq $63, %r14
+; SSE4-NEXT: addb %r15b, %r15b
+; SSE4-NEXT: sarb %r15b
; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
; SSE4-NEXT: addb %sil, %sil
; SSE4-NEXT: sarb %sil
-; SSE4-NEXT: cmpb %r10b, %sil
+; SSE4-NEXT: cmpb %r15b, %sil
; SSE4-NEXT: setl %sil
-; SSE4-NEXT: setg %r10b
-; SSE4-NEXT: subb %sil, %r10b
-; SSE4-NEXT: movsbq %r10b, %r10
-; SSE4-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: sarq $63, %r10
-; SSE4-NEXT: addb %r11b, %r11b
-; SSE4-NEXT: sarb %r11b
+; SSE4-NEXT: setg %r15b
+; SSE4-NEXT: subb %sil, %r15b
+; SSE4-NEXT: movsbq %r15b, %r15
+; SSE4-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT: sarq $63, %r15
+; SSE4-NEXT: addb %bpl, %bpl
+; SSE4-NEXT: sarb %bpl
; SSE4-NEXT: addb %dl, %dl
; SSE4-NEXT: sarb %dl
-; SSE4-NEXT: cmpb %r11b, %dl
+; SSE4-NEXT: cmpb %bpl, %dl
; SSE4-NEXT: setl %dl
-; SSE4-NEXT: setg %r11b
-; SSE4-NEXT: subb %dl, %r11b
-; SSE4-NEXT: movsbq %r11b, %r11
-; SSE4-NEXT: movq %r11, %rsi
-; SSE4-NEXT: sarq $63, %rsi
-; SSE4-NEXT: addb %bl, %bl
-; SSE4-NEXT: sarb %bl
+; SSE4-NEXT: setg %bpl
+; SSE4-NEXT: subb %dl, %bpl
+; SSE4-NEXT: movsbq %bpl, %r12
+; SSE4-NEXT: movq %r12, %r13
+; SSE4-NEXT: sarq $63, %r13
+; SSE4-NEXT: addb %al, %al
+; SSE4-NEXT: sarb %al
; SSE4-NEXT: addb %cl, %cl
; SSE4-NEXT: sarb %cl
-; SSE4-NEXT: cmpb %bl, %cl
+; SSE4-NEXT: cmpb %al, %cl
; SSE4-NEXT: setl %cl
; SSE4-NEXT: setg %dl
; SSE4-NEXT: subb %cl, %dl
-; SSE4-NEXT: movsbq %dl, %rbx
-; SSE4-NEXT: movq %rbx, %rcx
+; SSE4-NEXT: movsbq %dl, %rsi
+; SSE4-NEXT: movq %rsi, %rcx
; SSE4-NEXT: sarq $63, %rcx
-; SSE4-NEXT: addb %r13b, %r13b
-; SSE4-NEXT: sarb %r13b
+; SSE4-NEXT: addb %r11b, %r11b
+; SSE4-NEXT: sarb %r11b
; SSE4-NEXT: addb %r8b, %r8b
; SSE4-NEXT: sarb %r8b
-; SSE4-NEXT: cmpb %r13b, %r8b
+; SSE4-NEXT: cmpb %r11b, %r8b
; SSE4-NEXT: setl %dl
; SSE4-NEXT: setg %r8b
; SSE4-NEXT: subb %dl, %r8b
; SSE4-NEXT: movsbq %r8b, %rdx
; SSE4-NEXT: movq %rdx, %r8
; SSE4-NEXT: sarq $63, %r8
-; SSE4-NEXT: addb %r15b, %r15b
-; SSE4-NEXT: sarb %r15b
+; SSE4-NEXT: addb %r10b, %r10b
+; SSE4-NEXT: sarb %r10b
; SSE4-NEXT: addb %r9b, %r9b
; SSE4-NEXT: sarb %r9b
-; SSE4-NEXT: cmpb %r15b, %r9b
+; SSE4-NEXT: cmpb %r10b, %r9b
; SSE4-NEXT: setl %r9b
-; SSE4-NEXT: setg %r15b
-; SSE4-NEXT: subb %r9b, %r15b
-; SSE4-NEXT: movsbq %r15b, %r9
-; SSE4-NEXT: movq %r9, %r15
-; SSE4-NEXT: sarq $63, %r15
-; SSE4-NEXT: addb %bpl, %bpl
-; SSE4-NEXT: sarb %bpl
-; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
-; SSE4-NEXT: addb %r13b, %r13b
-; SSE4-NEXT: sarb %r13b
-; SSE4-NEXT: cmpb %bpl, %r13b
-; SSE4-NEXT: setl %bpl
-; SSE4-NEXT: setg %r13b
-; SSE4-NEXT: subb %bpl, %r13b
-; SSE4-NEXT: movsbq %r13b, %rbp
+; SSE4-NEXT: setg %r10b
+; SSE4-NEXT: subb %r9b, %r10b
+; SSE4-NEXT: movsbq %r10b, %r9
+; SSE4-NEXT: movq %r9, %r10
+; SSE4-NEXT: sarq $63, %r10
+; SSE4-NEXT: addb %dil, %dil
+; SSE4-NEXT: sarb %dil
+; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; SSE4-NEXT: addb %r11b, %r11b
+; SSE4-NEXT: sarb %r11b
+; SSE4-NEXT: cmpb %dil, %r11b
+; SSE4-NEXT: setl %dil
+; SSE4-NEXT: setg %r11b
+; SSE4-NEXT: subb %dil, %r11b
+; SSE4-NEXT: movsbq %r11b, %rdi
+; SSE4-NEXT: movq %rdi, %rbp
+; SSE4-NEXT: sarq $63, %rbp
+; SSE4-NEXT: movl %ebp, 96(%rbx)
; SSE4-NEXT: movq %rbp, %rax
-; SSE4-NEXT: sarq $63, %rax
-; SSE4-NEXT: movq %rax, %r13
-; SSE4-NEXT: shldq $62, %rbp, %r13
-; SSE4-NEXT: movq %r12, (%r14)
-; SSE4-NEXT: movq %r13, 88(%r14)
-; SSE4-NEXT: shrq $2, %rax
-; SSE4-NEXT: movl %eax, 96(%r14)
-; SSE4-NEXT: movq %r15, %r12
-; SSE4-NEXT: shldq $20, %r9, %r12
-; SSE4-NEXT: movq %r12, 64(%r14)
-; SSE4-NEXT: movq %r8, %r12
-; SSE4-NEXT: shldq $31, %rdx, %r12
-; SSE4-NEXT: movq %r12, 48(%r14)
-; SSE4-NEXT: movq %rcx, %r12
-; SSE4-NEXT: shldq $42, %rbx, %r12
-; SSE4-NEXT: movabsq $9007199254738944, %r13 # imm = 0x1FFFFFFFFFF800
-; SSE4-NEXT: andq %rsi, %r13
-; SSE4-NEXT: shldq $53, %r11, %rsi
-; SSE4-NEXT: movq %r12, 32(%r14)
-; SSE4-NEXT: movq %rsi, 16(%r14)
-; SSE4-NEXT: movabsq $9007199254740991, %rsi # imm = 0x1FFFFFFFFFFFFF
-; SSE4-NEXT: andq %rsi, %r10
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT: shldq $9, %r12, %r10
-; SSE4-NEXT: shlq $62, %rbp
-; SSE4-NEXT: orq %r10, %rbp
-; SSE4-NEXT: movq %rbp, 80(%r14)
-; SSE4-NEXT: andq %rsi, %rdi
-; SSE4-NEXT: shlq $53, %r11
-; SSE4-NEXT: orq %rdi, %r11
-; SSE4-NEXT: movq %r11, 8(%r14)
-; SSE4-NEXT: movabsq $2251799813685247, %rsi # imm = 0x7FFFFFFFFFFFF
-; SSE4-NEXT: andq %rax, %rsi
-; SSE4-NEXT: movq %rsi, %rax
-; SSE4-NEXT: shrq $48, %rax
-; SSE4-NEXT: movb %al, 102(%r14)
-; SSE4-NEXT: shrq $32, %rsi
-; SSE4-NEXT: movw %si, 100(%r14)
-; SSE4-NEXT: shlq $42, %rbx
-; SSE4-NEXT: shrq $11, %r13
-; SSE4-NEXT: orq %rbx, %r13
-; SSE4-NEXT: movq %r13, 24(%r14)
-; SSE4-NEXT: movq %r12, %rax
-; SSE4-NEXT: shlq $9, %rax
-; SSE4-NEXT: shrq $44, %r15
-; SSE4-NEXT: andl $511, %r15d # imm = 0x1FF
-; SSE4-NEXT: orq %rax, %r15
-; SSE4-NEXT: movq %r15, 72(%r14)
+; SSE4-NEXT: shldq $62, %rdi, %rax
+; SSE4-NEXT: movabsq $2251799813685247, %r11 # imm = 0x7FFFFFFFFFFFF
+; SSE4-NEXT: andq %rbp, %r11
+; SSE4-NEXT: movq %r10, %rbp
+; SSE4-NEXT: shldq $20, %r9, %rbp
+; SSE4-NEXT: movq %rax, 88(%rbx)
+; SSE4-NEXT: movq %r8, %rax
+; SSE4-NEXT: shldq $31, %rdx, %rax
+; SSE4-NEXT: movq %rbp, 64(%rbx)
+; SSE4-NEXT: movq %rcx, %rbp
+; SSE4-NEXT: shldq $42, %rsi, %rbp
+; SSE4-NEXT: movq %rax, 48(%rbx)
+; SSE4-NEXT: movq %rbp, 32(%rbx)
+; SSE4-NEXT: movabsq $9007199254738944, %rax # imm = 0x1FFFFFFFFFF800
+; SSE4-NEXT: andq %r13, %rax
+; SSE4-NEXT: shldq $53, %r12, %r13
+; SSE4-NEXT: movq %r13, 16(%rbx)
+; SSE4-NEXT: movq %r11, %r13
+; SSE4-NEXT: shrq $48, %r13
+; SSE4-NEXT: movb %r13b, 102(%rbx)
+; SSE4-NEXT: shrq $32, %r11
+; SSE4-NEXT: movabsq $9007199254740991, %r13 # imm = 0x1FFFFFFFFFFFFF
+; SSE4-NEXT: andq %r13, %r15
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE4-NEXT: shldq $9, %rbp, %r15
+; SSE4-NEXT: movw %r11w, 100(%rbx)
+; SSE4-NEXT: shlq $62, %rdi
+; SSE4-NEXT: orq %r15, %rdi
+; SSE4-NEXT: movq %rdi, 80(%rbx)
+; SSE4-NEXT: andq %r13, %r14
+; SSE4-NEXT: shlq $53, %r12
+; SSE4-NEXT: orq %r14, %r12
+; SSE4-NEXT: movq %r12, 8(%rbx)
+; SSE4-NEXT: shlq $42, %rsi
+; SSE4-NEXT: shrq $11, %rax
+; SSE4-NEXT: orq %rsi, %rax
+; SSE4-NEXT: movq %rax, 24(%rbx)
+; SSE4-NEXT: shlq $9, %rbp
+; SSE4-NEXT: shrq $44, %r10
+; SSE4-NEXT: andl $511, %r10d # imm = 0x1FF
+; SSE4-NEXT: orq %rbp, %r10
+; SSE4-NEXT: movq %r10, 72(%rbx)
; SSE4-NEXT: shlq $20, %r9
; SSE4-NEXT: shrq $33, %r8
; SSE4-NEXT: andl $1048575, %r8d # imm = 0xFFFFF
; SSE4-NEXT: orq %r9, %r8
-; SSE4-NEXT: movq %r8, 56(%r14)
+; SSE4-NEXT: movq %r8, 56(%rbx)
; SSE4-NEXT: shlq $31, %rdx
; SSE4-NEXT: shrq $22, %rcx
; SSE4-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF
; SSE4-NEXT: orq %rdx, %rcx
-; SSE4-NEXT: movq %rcx, 40(%r14)
-; SSE4-NEXT: movq %r14, %rax
+; SSE4-NEXT: movq %rcx, 40(%rbx)
+; SSE4-NEXT: movq %rbx, %rax
; SSE4-NEXT: popq %rbx
; SSE4-NEXT: popq %r12
; SSE4-NEXT: popq %r13
@@ -2174,14 +2169,14 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; AVX-NEXT: setl %dil
; AVX-NEXT: setg %r11b
; AVX-NEXT: subb %dil, %r11b
-; AVX-NEXT: movsbq %r11b, %r11
-; AVX-NEXT: movq %r11, %rdi
-; AVX-NEXT: sarq $63, %rdi
+; AVX-NEXT: movsbq %r11b, %rdi
; AVX-NEXT: movq %rdi, %rbp
-; AVX-NEXT: shldq $62, %r11, %rbp
+; AVX-NEXT: sarq $63, %rbp
+; AVX-NEXT: movl %ebp, 96(%rax)
+; AVX-NEXT: movb $51, %r11b
+; AVX-NEXT: bzhiq %r11, %rbp, %r11
+; AVX-NEXT: shldq $62, %rdi, %rbp
; AVX-NEXT: movq %rbp, 88(%rax)
-; AVX-NEXT: shrq $2, %rdi
-; AVX-NEXT: movl %edi, 96(%rax)
; AVX-NEXT: movq %r10, %rbp
; AVX-NEXT: shldq $20, %r9, %rbp
; AVX-NEXT: movq %rbp, 64(%rax)
@@ -2195,23 +2190,21 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; AVX-NEXT: bzhiq %rbp, %r13, %rbp
; AVX-NEXT: shldq $53, %r12, %r13
; AVX-NEXT: movq %r13, 16(%rax)
-; AVX-NEXT: movb $53, %r13b
-; AVX-NEXT: bzhiq %r13, %r15, %r15
+; AVX-NEXT: movq %r11, %r13
+; AVX-NEXT: shrq $48, %r13
+; AVX-NEXT: movb %r13b, 102(%rax)
+; AVX-NEXT: shrq $32, %r11
+; AVX-NEXT: movw %r11w, 100(%rax)
+; AVX-NEXT: movb $53, %r11b
+; AVX-NEXT: bzhiq %r11, %r15, %r15
; AVX-NEXT: shldq $9, %rsi, %r15
-; AVX-NEXT: shlq $62, %r11
-; AVX-NEXT: orq %r15, %r11
-; AVX-NEXT: movq %r11, 80(%rax)
-; AVX-NEXT: bzhiq %r13, %r14, %r11
+; AVX-NEXT: shlq $62, %rdi
+; AVX-NEXT: orq %r15, %rdi
+; AVX-NEXT: movq %rdi, 80(%rax)
+; AVX-NEXT: bzhiq %r11, %r14, %rdi
; AVX-NEXT: shlq $53, %r12
-; AVX-NEXT: orq %r11, %r12
+; AVX-NEXT: orq %rdi, %r12
; AVX-NEXT: movq %r12, 8(%rax)
-; AVX-NEXT: movb $51, %r11b
-; AVX-NEXT: bzhiq %r11, %rdi, %rdi
-; AVX-NEXT: movq %rdi, %r11
-; AVX-NEXT: shrq $48, %r11
-; AVX-NEXT: movb %r11b, 102(%rax)
-; AVX-NEXT: shrq $32, %rdi
-; AVX-NEXT: movw %di, 100(%rax)
; AVX-NEXT: shlq $42, %rbx
; AVX-NEXT: shrq $11, %rbp
; AVX-NEXT: orq %rbx, %rbp
@@ -2270,24 +2263,24 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; X86-NEXT: addb %al, %al
; X86-NEXT: sarb %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: addb %dl, %dl
-; X86-NEXT: sarb %dl
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT: addb %ah, %ah
-; X86-NEXT: sarb %ah
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: addb %cl, %cl
-; X86-NEXT: sarb %cl
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT: addb %ch, %ch
-; X86-NEXT: sarb %ch
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: addb %bl, %bl
; X86-NEXT: sarb %bl
; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
; X86-NEXT: addb %bh, %bh
; X86-NEXT: sarb %bh
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: addb %dl, %dl
+; X86-NEXT: sarb %dl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT: addb %ch, %ch
+; X86-NEXT: sarb %ch
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: addb %cl, %cl
+; X86-NEXT: sarb %cl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: addb %ah, %ah
+; X86-NEXT: sarb %ah
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: addb %al, %al
; X86-NEXT: sarb %al
@@ -2304,140 +2297,136 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpb %bl, %bh
+; X86-NEXT: cmpb %cl, %ah
; X86-NEXT: setl %al
-; X86-NEXT: setg %dh
-; X86-NEXT: subb %al, %dh
-; X86-NEXT: movsbl %dh, %esi
+; X86-NEXT: setg %cl
+; X86-NEXT: subb %al, %cl
+; X86-NEXT: movsbl %cl, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %esi, %ebp
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpb %cl, %ch
+; X86-NEXT: andl $2097151, %eax # imm = 0x1FFFFF
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpb %dl, %ch
; X86-NEXT: setl %al
; X86-NEXT: setg %cl
; X86-NEXT: subb %al, %cl
-; X86-NEXT: movsbl %cl, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ecx, (%ebp)
-; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpb %dl, %ah
-; X86-NEXT: setl %al
+; X86-NEXT: movsbl %cl, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %eax, (%ecx)
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $2097151, %ecx # imm = 0x1FFFFF
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpb %bl, %bh
+; X86-NEXT: setl %cl
; X86-NEXT: setg %dl
-; X86-NEXT: subb %al, %dl
+; X86-NEXT: subb %cl, %dl
; X86-NEXT: movsbl %dl, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sarl $31, %edi
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X86-NEXT: setl %al
-; X86-NEXT: setg %dl
-; X86-NEXT: subb %al, %dl
-; X86-NEXT: movsbl %dl, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X86-NEXT: setl %al
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT: setl %cl
; X86-NEXT: setg %dl
-; X86-NEXT: subb %al, %dl
-; X86-NEXT: movsbl %dl, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT: setl %dl
-; X86-NEXT: setg %dh
-; X86-NEXT: subb %dl, %dh
-; X86-NEXT: movsbl %dh, %edx
+; X86-NEXT: subb %cl, %dl
+; X86-NEXT: movsbl %dl, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %edx, 96(%ebp)
-; X86-NEXT: movl %edx, 92(%ebp)
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, 80(%ebp)
-; X86-NEXT: movl %eax, 68(%ebp)
-; X86-NEXT: movl %eax, 64(%ebp)
-; X86-NEXT: movl %esi, 52(%ebp)
-; X86-NEXT: movl %esi, 48(%ebp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, 36(%ebp)
-; X86-NEXT: movl %edi, 24(%ebp)
-; X86-NEXT: movl %edi, 20(%ebp)
-; X86-NEXT: movl %ecx, 8(%ebp)
-; X86-NEXT: movl %ecx, 4(%ebp)
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $30, %edx, %ecx
-; X86-NEXT: movl %ecx, 88(%ebp)
-; X86-NEXT: movl %ebp, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $9, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT: setl %cl
+; X86-NEXT: setg %ch
+; X86-NEXT: subb %cl, %ch
+; X86-NEXT: movsbl %ch, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT: setl %cl
+; X86-NEXT: setg %ch
+; X86-NEXT: subb %cl, %ch
+; X86-NEXT: movsbl %ch, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %esi, 96(%ecx)
+; X86-NEXT: movl %esi, 92(%ecx)
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %ebp, 80(%ecx)
+; X86-NEXT: movl %ebx, 68(%ecx)
+; X86-NEXT: movl %ebx, 64(%ecx)
+; X86-NEXT: movl %edx, 52(%ecx)
+; X86-NEXT: movl %edx, 48(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $9, %ebp, %ecx
-; X86-NEXT: movl %ebx, %ebp
-; X86-NEXT: movl %ecx, 76(%ebx)
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $20, %ebx, %ecx
-; X86-NEXT: movl %ecx, 60(%ebp)
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $31, %ebx, %ecx
-; X86-NEXT: movl %ecx, 44(%ebp)
-; X86-NEXT: movl %ebp, %ebx
+; X86-NEXT: movl %ebp, 36(%ecx)
+; X86-NEXT: movl %edi, 24(%ecx)
+; X86-NEXT: movl %edi, 20(%ecx)
+; X86-NEXT: movl %eax, 8(%ecx)
+; X86-NEXT: movl %eax, 4(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movw %ax, 100(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $10, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl $30, %ecx, %eax
+; X86-NEXT: movl %eax, 88(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $9, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: shldl $9, %ebp, %eax
+; X86-NEXT: movl %eax, 76(%esi)
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $10, %ebp, %ecx
-; X86-NEXT: movl %ecx, 32(%ebx)
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: shldl $20, %ebp, %eax
+; X86-NEXT: movl %eax, 60(%esi)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: shldl $31, %ebp, %eax
+; X86-NEXT: movl %eax, 44(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $10, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $21, %ebp, %ecx
-; X86-NEXT: movl %ecx, 16(%ebx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: shrl $2, %ecx
-; X86-NEXT: movw %cx, 100(%ebx)
+; X86-NEXT: shldl $10, %ebp, %eax
+; X86-NEXT: movl %eax, 32(%esi)
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: shldl $21, %ebp, %eax
+; X86-NEXT: movl %eax, 16(%esi)
; X86-NEXT: shll $21, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl %ebp, 12(%ebx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shll $30, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, 84(%ebx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shll $9, %ecx
-; X86-NEXT: shrl $12, %eax
-; X86-NEXT: andl $511, %eax # imm = 0x1FF
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl %eax, 72(%ebx)
+; X86-NEXT: movl %ebp, 12(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl $7, %eax
+; X86-NEXT: movb %al, 102(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shll $30, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, 84(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shll $9, %eax
+; X86-NEXT: shrl $12, %ebx
+; X86-NEXT: andl $511, %ebx # imm = 0x1FF
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: movl %ebx, 72(%esi)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shll $20, %eax
-; X86-NEXT: shrl %esi
-; X86-NEXT: andl $1048575, %esi # imm = 0xFFFFF
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, 56(%ebx)
+; X86-NEXT: shrl %edx
+; X86-NEXT: andl $1048575, %edx # imm = 0xFFFFF
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, 56(%esi)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shll $31, %eax
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, 40(%ebx)
+; X86-NEXT: movl %eax, 40(%esi)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shll $10, %eax
; X86-NEXT: shrl $11, %edi
; X86-NEXT: andl $1023, %edi # imm = 0x3FF
; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl %edi, 28(%ebx)
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl $18, %eax
-; X86-NEXT: andl $7, %eax
-; X86-NEXT: movb %al, 102(%ebx)
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %edi, 28(%esi)
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $52, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi