[llvm] [DAG] visitFREEZE - always allow freezing multiple operands (PR #145939)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 26 14:04:16 PDT 2025


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/145939

From f30ba4ede6d62b93c2721ebe272d45ffb054c452 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 26 Jun 2025 18:43:18 +0100
Subject: [PATCH] [DAG] visitFREEZE - always allow freezing multiple operands

Always try to fold freeze(op(x, y, ...)) -> op(freeze(x), freeze(y), ...).

This patch proposes dropping the opt-in allowlist of opcodes through which we may push a freeze in order to freeze all of the op's maybe-poison operands, bringing us more in line with how InstCombine handles this.
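
To illustrate the shape of the fold (an IR-level sketch only; the combine itself operates on SDNodes, and add is just an arbitrary example opcode, not something specific to this patch):

    ; Before: only the result of the add is frozen.
    %op = add i64 %x, %y      ; %x and %y may each be poison
    %f  = freeze i64 %op

    ; After pushing the freeze through the add, each maybe-poison operand
    ; gets its own freeze (poison-generating flags such as nuw/nsw are
    ; stripped, per the NOTE in visitFREEZE):
    %fx = freeze i64 %x
    %fy = freeze i64 %y
    %f  = add i64 %fx, %fy

Previously this only fired for more than one maybe-poison operand when the op was on the allowlist (SELECT_CC, SETCC, BUILD_VECTOR, BUILD_PAIR, VECTOR_SHUFFLE, CONCAT_VECTORS, FMUL); with this change it applies to any op that cannot itself create undef/poison.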

I'm struggling to find a strong reason for this limit apart from DAG freeze handling having been immature for so long; as we've improved coverage in canCreateUndefOrPoison/isGuaranteedNotToBeUndefOrPoison, the regressions no longer look as severe.

If there are no objections to this approach, I will yak-shave some of the remaining regressions.

Hopefully this will help some of the regression issues in #143102 etc.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   14 +-
 llvm/test/CodeGen/AMDGPU/div_i128.ll          |   64 +-
 llvm/test/CodeGen/AMDGPU/rem_i128.ll          |   64 +-
 llvm/test/CodeGen/NVPTX/i1-select.ll          |   30 +-
 llvm/test/CodeGen/NVPTX/i128.ll               |  664 +++---
 llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll   |  417 ++--
 llvm/test/CodeGen/RISCV/fpclamptosat.ll       |   88 +-
 .../RISCV/intrinsic-cttz-elts-vscale.ll       |   36 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll     |   18 +
 .../RISCV/wide-scalar-shift-legalization.ll   | 1901 ++++++++---------
 llvm/test/CodeGen/SystemZ/pr60413.ll          |   13 +-
 llvm/test/CodeGen/X86/abds-neg.ll             |   36 +-
 llvm/test/CodeGen/X86/abds-vector-128.ll      |    2 +-
 llvm/test/CodeGen/X86/avg.ll                  |   66 +-
 llvm/test/CodeGen/X86/freeze-vector.ll        |  124 +-
 .../test/CodeGen/X86/setcc-non-simple-type.ll |    4 +-
 16 files changed, 1731 insertions(+), 1810 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 08dab7c697b99..6d99cbac4223a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16609,8 +16609,7 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   // Fold freeze(op(x, ...)) -> op(freeze(x), ...).
   // Try to push freeze through instructions that propagate but don't produce
-  // poison as far as possible. If an operand of freeze follows three
-  // conditions 1) one-use, 2) does not produce poison, and 3) has all but one
-  // guaranteed-non-poison operands (or is a BUILD_VECTOR or similar) then push
+  // poison as far as possible. If an operand of freeze follows two
+  // conditions: 1) it has one use, and 2) it does not produce poison, then push
   // the freeze through to the operands that are not guaranteed non-poison.
   // NOTE: we will strip poison-generating flags, so ignore them here.
   if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false,
@@ -16618,13 +16617,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       N0->getNumValues() != 1 || !N0->hasOneUse())
     return SDValue();
 
-  bool AllowMultipleMaybePoisonOperands =
-      N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
-      N0.getOpcode() == ISD::BUILD_VECTOR ||
-      N0.getOpcode() == ISD::BUILD_PAIR ||
-      N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
-      N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
-
   // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
   // ones" or "constant" into something that depends on FrozenUndef. We can
   // instead pick undef values to keep those properties, while at the same time
@@ -16657,10 +16649,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       MaybePoisonOperandNumbers.push_back(OpNo);
     if (!HadMaybePoisonOperands)
       continue;
-    if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
-      // Multiple maybe-poison ops when not allowed - bail out.
-      return SDValue();
-    }
   }
   // NOTE: the whole op may be not guaranteed to not be undef or poison because
   // it could create undef or poison due to it's poison-generating flags.
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 51398a45055eb..f8e13fcdd2273 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -475,21 +475,28 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v6, 1, v6
@@ -500,6 +507,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v6, v6, s14
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1035,10 +1043,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
@@ -2656,21 +2664,28 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v6, 1, v6
@@ -2681,6 +2696,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v6, v6, s14
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -3216,10 +3232,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 6512bee36e88b..ba9dd8f7c2468 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -513,21 +513,28 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v6, 1, v6
@@ -538,6 +545,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v6, v6, s14
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1073,10 +1081,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
@@ -1889,21 +1897,28 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v6, 1, v6
@@ -1914,6 +1929,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v6, v6, s14
+; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -2449,10 +2465,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index 6fb5aad4b1eb9..562c746200d87 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -94,27 +94,27 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
 define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
 ; CHECK-LABEL: test_select_i1_basic_folding(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<13>;
-; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .pred %p<12>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_i1_basic_folding_param_0];
 ; CHECK-NEXT:    setp.eq.s32 %p1, %r1, 0;
-; CHECK-NEXT:    ld.param.b32 %r2, [test_select_i1_basic_folding_param_1];
-; CHECK-NEXT:    setp.ne.s32 %p2, %r2, 0;
-; CHECK-NEXT:    setp.eq.s32 %p3, %r2, 0;
-; CHECK-NEXT:    ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
-; CHECK-NEXT:    setp.eq.s32 %p4, %r3, 0;
-; CHECK-NEXT:    ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_select_i1_basic_folding_param_1];
+; CHECK-NEXT:    setp.ne.s32 %p2, %r3, 0;
+; CHECK-NEXT:    setp.eq.s32 %p3, %r3, 0;
+; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_folding_param_2];
+; CHECK-NEXT:    setp.eq.s32 %p4, %r5, 0;
+; CHECK-NEXT:    ld.param.b32 %r6, [test_select_i1_basic_folding_param_3];
 ; CHECK-NEXT:    xor.pred %p6, %p1, %p3;
-; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
+; CHECK-NEXT:    ld.param.b32 %r7, [test_select_i1_basic_folding_param_4];
 ; CHECK-NEXT:    and.pred %p7, %p6, %p4;
-; CHECK-NEXT:    and.pred %p9, %p2, %p4;
-; CHECK-NEXT:    and.pred %p10, %p3, %p7;
-; CHECK-NEXT:    or.pred %p11, %p10, %p9;
-; CHECK-NEXT:    xor.pred %p12, %p11, %p3;
-; CHECK-NEXT:    selp.b32 %r6, %r4, %r5, %p12;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    and.pred %p8, %p2, %p4;
+; CHECK-NEXT:    and.pred %p9, %p3, %p7;
+; CHECK-NEXT:    or.pred %p10, %p9, %p8;
+; CHECK-NEXT:    xor.pred %p11, %p10, %p3;
+; CHECK-NEXT:    selp.b32 %r8, %r6, %r7, %p11;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
 ; CHECK-NEXT:    ret;
   %b1 = icmp eq i32 %v1, 0
   %b2 = icmp eq i32 %v2, 0
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index ecd42fd6ceb3c..899b5bafd0976 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -5,9 +5,9 @@
 define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: srem_i128(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<20>;
+; CHECK-NEXT:    .reg .pred %p<22>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<127>;
+; CHECK-NEXT:    .reg .b64 %rd<128>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0];
@@ -37,107 +37,108 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    selp.b64 %rd60, %rd57, %rd59, %p6;
 ; CHECK-NEXT:    setp.ne.s64 %p7, %rd4, 0;
 ; CHECK-NEXT:    clz.b64 %r3, %rd4;
-; CHECK-NEXT:    cvt.u64.u32 %rd61, %r3;
+; CHECK-NEXT:    cvt.u64.u32 %rd62, %r3;
 ; CHECK-NEXT:    clz.b64 %r4, %rd3;
-; CHECK-NEXT:    cvt.u64.u32 %rd62, %r4;
-; CHECK-NEXT:    add.s64 %rd63, %rd62, 64;
-; CHECK-NEXT:    selp.b64 %rd64, %rd61, %rd63, %p7;
-; CHECK-NEXT:    mov.b64 %rd117, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd66, %rd60, %rd64;
-; CHECK-NEXT:    subc.cc.s64 %rd67, %rd117, 0;
-; CHECK-NEXT:    setp.gt.u64 %p8, %rd66, 127;
-; CHECK-NEXT:    setp.eq.s64 %p9, %rd67, 0;
-; CHECK-NEXT:    and.pred %p10, %p9, %p8;
-; CHECK-NEXT:    setp.ne.s64 %p11, %rd67, 0;
-; CHECK-NEXT:    or.pred %p12, %p10, %p11;
-; CHECK-NEXT:    or.pred %p13, %p5, %p12;
-; CHECK-NEXT:    xor.b64 %rd68, %rd66, 127;
-; CHECK-NEXT:    or.b64 %rd69, %rd68, %rd67;
-; CHECK-NEXT:    setp.eq.s64 %p14, %rd69, 0;
-; CHECK-NEXT:    selp.b64 %rd126, 0, %rd4, %p13;
-; CHECK-NEXT:    selp.b64 %rd125, 0, %rd3, %p13;
-; CHECK-NEXT:    or.pred %p15, %p13, %p14;
-; CHECK-NEXT:    @%p15 bra $L__BB0_5;
+; CHECK-NEXT:    cvt.u64.u32 %rd63, %r4;
+; CHECK-NEXT:    add.s64 %rd64, %rd63, 64;
+; CHECK-NEXT:    selp.b64 %rd65, %rd62, %rd64, %p7;
+; CHECK-NEXT:    mov.b64 %rd118, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd68, %rd60, %rd65;
+; CHECK-NEXT:    subc.cc.s64 %rd8, %rd118, 0;
+; CHECK-NEXT:    setp.ne.s64 %p8, %rd8, 0;
+; CHECK-NEXT:    and.pred %p10, %p8, %p8;
+; CHECK-NEXT:    setp.eq.s64 %p11, %rd8, 0;
+; CHECK-NEXT:    setp.gt.u64 %p12, %rd68, 127;
+; CHECK-NEXT:    and.pred %p13, %p11, %p12;
+; CHECK-NEXT:    or.pred %p14, %p13, %p10;
+; CHECK-NEXT:    or.pred %p15, %p5, %p14;
+; CHECK-NEXT:    xor.b64 %rd69, %rd68, 127;
+; CHECK-NEXT:    or.b64 %rd70, %rd69, %rd8;
+; CHECK-NEXT:    setp.eq.s64 %p16, %rd70, 0;
+; CHECK-NEXT:    selp.b64 %rd127, 0, %rd4, %p15;
+; CHECK-NEXT:    selp.b64 %rd126, 0, %rd3, %p15;
+; CHECK-NEXT:    or.pred %p17, %p15, %p16;
+; CHECK-NEXT:    @%p17 bra $L__BB0_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd119, %rd66, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd120, %rd67, 0;
-; CHECK-NEXT:    or.b64 %rd72, %rd119, %rd120;
-; CHECK-NEXT:    setp.eq.s64 %p16, %rd72, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r5, %rd66;
+; CHECK-NEXT:    add.cc.s64 %rd120, %rd68, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd121, %rd8, 0;
+; CHECK-NEXT:    or.b64 %rd73, %rd120, %rd121;
+; CHECK-NEXT:    setp.eq.s64 %p18, %rd73, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r5, %rd68;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd73, %rd4, %r6;
+; CHECK-NEXT:    shl.b64 %rd74, %rd4, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd74, %rd3, %r7;
-; CHECK-NEXT:    or.b64 %rd75, %rd73, %rd74;
+; CHECK-NEXT:    shr.u64 %rd75, %rd3, %r7;
+; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd76, %rd3, %r8;
-; CHECK-NEXT:    setp.gt.s32 %p17, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd124, %rd76, %rd75, %p17;
-; CHECK-NEXT:    shl.b64 %rd123, %rd3, %r6;
-; CHECK-NEXT:    mov.b64 %rd114, %rd117;
-; CHECK-NEXT:    @%p16 bra $L__BB0_4;
+; CHECK-NEXT:    shl.b64 %rd77, %rd3, %r8;
+; CHECK-NEXT:    setp.gt.s32 %p19, %r6, 63;
+; CHECK-NEXT:    selp.b64 %rd125, %rd77, %rd76, %p19;
+; CHECK-NEXT:    shl.b64 %rd124, %rd3, %r6;
+; CHECK-NEXT:    mov.b64 %rd115, %rd118;
+; CHECK-NEXT:    @%p18 bra $L__BB0_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd119;
-; CHECK-NEXT:    shr.u64 %rd79, %rd3, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd120;
+; CHECK-NEXT:    shr.u64 %rd80, %rd3, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd80, %rd4, %r10;
-; CHECK-NEXT:    or.b64 %rd81, %rd79, %rd80;
+; CHECK-NEXT:    shl.b64 %rd81, %rd4, %r10;
+; CHECK-NEXT:    or.b64 %rd82, %rd80, %rd81;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd82, %rd4, %r11;
-; CHECK-NEXT:    setp.gt.s32 %p18, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd121, %rd82, %rd81, %p18;
-; CHECK-NEXT:    shr.u64 %rd122, %rd4, %r9;
+; CHECK-NEXT:    shr.u64 %rd83, %rd4, %r11;
+; CHECK-NEXT:    setp.gt.s32 %p20, %r9, 63;
+; CHECK-NEXT:    selp.b64 %rd122, %rd83, %rd82, %p20;
+; CHECK-NEXT:    shr.u64 %rd123, %rd4, %r9;
 ; CHECK-NEXT:    add.cc.s64 %rd35, %rd5, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd36, %rd6, -1;
-; CHECK-NEXT:    mov.b64 %rd114, 0;
-; CHECK-NEXT:    mov.b64 %rd117, %rd114;
+; CHECK-NEXT:    mov.b64 %rd115, 0;
+; CHECK-NEXT:    mov.b64 %rd118, %rd115;
 ; CHECK-NEXT:  $L__BB0_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd83, %rd121, 63;
-; CHECK-NEXT:    shl.b64 %rd84, %rd122, 1;
-; CHECK-NEXT:    or.b64 %rd85, %rd84, %rd83;
-; CHECK-NEXT:    shl.b64 %rd86, %rd121, 1;
-; CHECK-NEXT:    shr.u64 %rd87, %rd124, 63;
-; CHECK-NEXT:    or.b64 %rd88, %rd86, %rd87;
-; CHECK-NEXT:    shr.u64 %rd89, %rd123, 63;
-; CHECK-NEXT:    shl.b64 %rd90, %rd124, 1;
-; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT:    shl.b64 %rd92, %rd123, 1;
-; CHECK-NEXT:    or.b64 %rd123, %rd117, %rd92;
-; CHECK-NEXT:    or.b64 %rd124, %rd114, %rd91;
-; CHECK-NEXT:    sub.cc.s64 %rd93, %rd35, %rd88;
-; CHECK-NEXT:    subc.cc.s64 %rd94, %rd36, %rd85;
-; CHECK-NEXT:    shr.s64 %rd95, %rd94, 63;
-; CHECK-NEXT:    and.b64 %rd117, %rd95, 1;
-; CHECK-NEXT:    and.b64 %rd96, %rd95, %rd5;
-; CHECK-NEXT:    and.b64 %rd97, %rd95, %rd6;
-; CHECK-NEXT:    sub.cc.s64 %rd121, %rd88, %rd96;
-; CHECK-NEXT:    subc.cc.s64 %rd122, %rd85, %rd97;
-; CHECK-NEXT:    add.cc.s64 %rd119, %rd119, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd120, %rd120, -1;
-; CHECK-NEXT:    or.b64 %rd98, %rd119, %rd120;
-; CHECK-NEXT:    setp.eq.s64 %p19, %rd98, 0;
-; CHECK-NEXT:    @%p19 bra $L__BB0_4;
+; CHECK-NEXT:    shr.u64 %rd84, %rd122, 63;
+; CHECK-NEXT:    shl.b64 %rd85, %rd123, 1;
+; CHECK-NEXT:    or.b64 %rd86, %rd85, %rd84;
+; CHECK-NEXT:    shl.b64 %rd87, %rd122, 1;
+; CHECK-NEXT:    shr.u64 %rd88, %rd125, 63;
+; CHECK-NEXT:    or.b64 %rd89, %rd87, %rd88;
+; CHECK-NEXT:    shr.u64 %rd90, %rd124, 63;
+; CHECK-NEXT:    shl.b64 %rd91, %rd125, 1;
+; CHECK-NEXT:    or.b64 %rd92, %rd91, %rd90;
+; CHECK-NEXT:    shl.b64 %rd93, %rd124, 1;
+; CHECK-NEXT:    or.b64 %rd124, %rd118, %rd93;
+; CHECK-NEXT:    or.b64 %rd125, %rd115, %rd92;
+; CHECK-NEXT:    sub.cc.s64 %rd94, %rd35, %rd89;
+; CHECK-NEXT:    subc.cc.s64 %rd95, %rd36, %rd86;
+; CHECK-NEXT:    shr.s64 %rd96, %rd95, 63;
+; CHECK-NEXT:    and.b64 %rd118, %rd96, 1;
+; CHECK-NEXT:    and.b64 %rd97, %rd96, %rd5;
+; CHECK-NEXT:    and.b64 %rd98, %rd96, %rd6;
+; CHECK-NEXT:    sub.cc.s64 %rd122, %rd89, %rd97;
+; CHECK-NEXT:    subc.cc.s64 %rd123, %rd86, %rd98;
+; CHECK-NEXT:    add.cc.s64 %rd120, %rd120, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd121, %rd121, -1;
+; CHECK-NEXT:    or.b64 %rd99, %rd120, %rd121;
+; CHECK-NEXT:    setp.eq.s64 %p21, %rd99, 0;
+; CHECK-NEXT:    @%p21 bra $L__BB0_4;
 ; CHECK-NEXT:    bra.uni $L__BB0_2;
 ; CHECK-NEXT:  $L__BB0_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd99, %rd123, 63;
-; CHECK-NEXT:    shl.b64 %rd100, %rd124, 1;
-; CHECK-NEXT:    or.b64 %rd101, %rd100, %rd99;
-; CHECK-NEXT:    shl.b64 %rd102, %rd123, 1;
-; CHECK-NEXT:    or.b64 %rd125, %rd117, %rd102;
-; CHECK-NEXT:    or.b64 %rd126, %rd114, %rd101;
+; CHECK-NEXT:    shr.u64 %rd100, %rd124, 63;
+; CHECK-NEXT:    shl.b64 %rd101, %rd125, 1;
+; CHECK-NEXT:    or.b64 %rd102, %rd101, %rd100;
+; CHECK-NEXT:    shl.b64 %rd103, %rd124, 1;
+; CHECK-NEXT:    or.b64 %rd126, %rd118, %rd103;
+; CHECK-NEXT:    or.b64 %rd127, %rd115, %rd102;
 ; CHECK-NEXT:  $L__BB0_5: // %udiv-end
-; CHECK-NEXT:    mul.hi.u64 %rd103, %rd5, %rd125;
-; CHECK-NEXT:    mad.lo.s64 %rd104, %rd5, %rd126, %rd103;
-; CHECK-NEXT:    mad.lo.s64 %rd105, %rd6, %rd125, %rd104;
-; CHECK-NEXT:    mul.lo.s64 %rd106, %rd5, %rd125;
-; CHECK-NEXT:    sub.cc.s64 %rd107, %rd3, %rd106;
-; CHECK-NEXT:    subc.cc.s64 %rd108, %rd4, %rd105;
-; CHECK-NEXT:    xor.b64 %rd109, %rd107, %rd2;
+; CHECK-NEXT:    mul.hi.u64 %rd104, %rd5, %rd126;
+; CHECK-NEXT:    mad.lo.s64 %rd105, %rd5, %rd127, %rd104;
+; CHECK-NEXT:    mad.lo.s64 %rd106, %rd6, %rd126, %rd105;
+; CHECK-NEXT:    mul.lo.s64 %rd107, %rd5, %rd126;
+; CHECK-NEXT:    sub.cc.s64 %rd108, %rd3, %rd107;
+; CHECK-NEXT:    subc.cc.s64 %rd109, %rd4, %rd106;
 ; CHECK-NEXT:    xor.b64 %rd110, %rd108, %rd2;
-; CHECK-NEXT:    sub.cc.s64 %rd111, %rd109, %rd2;
-; CHECK-NEXT:    subc.cc.s64 %rd112, %rd110, %rd2;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd111, %rd112};
+; CHECK-NEXT:    xor.b64 %rd111, %rd109, %rd2;
+; CHECK-NEXT:    sub.cc.s64 %rd112, %rd110, %rd2;
+; CHECK-NEXT:    subc.cc.s64 %rd113, %rd111, %rd2;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd112, %rd113};
 ; CHECK-NEXT:    ret;
   %div = srem i128 %lhs, %rhs
   ret i128 %div
@@ -146,9 +147,9 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 define i128 @urem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: urem_i128(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<18>;
+; CHECK-NEXT:    .reg .pred %p<20>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<113>;
+; CHECK-NEXT:    .reg .b64 %rd<114>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0];
@@ -167,103 +168,104 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    selp.b64 %rd50, %rd47, %rd49, %p4;
 ; CHECK-NEXT:    setp.ne.s64 %p5, %rd42, 0;
 ; CHECK-NEXT:    clz.b64 %r3, %rd42;
-; CHECK-NEXT:    cvt.u64.u32 %rd51, %r3;
+; CHECK-NEXT:    cvt.u64.u32 %rd52, %r3;
 ; CHECK-NEXT:    clz.b64 %r4, %rd41;
-; CHECK-NEXT:    cvt.u64.u32 %rd52, %r4;
-; CHECK-NEXT:    add.s64 %rd53, %rd52, 64;
-; CHECK-NEXT:    selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT:    mov.b64 %rd103, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd56, %rd50, %rd54;
-; CHECK-NEXT:    subc.cc.s64 %rd57, %rd103, 0;
-; CHECK-NEXT:    setp.gt.u64 %p6, %rd56, 127;
-; CHECK-NEXT:    setp.eq.s64 %p7, %rd57, 0;
-; CHECK-NEXT:    and.pred %p8, %p7, %p6;
-; CHECK-NEXT:    setp.ne.s64 %p9, %rd57, 0;
-; CHECK-NEXT:    or.pred %p10, %p8, %p9;
-; CHECK-NEXT:    or.pred %p11, %p3, %p10;
-; CHECK-NEXT:    xor.b64 %rd58, %rd56, 127;
-; CHECK-NEXT:    or.b64 %rd59, %rd58, %rd57;
-; CHECK-NEXT:    setp.eq.s64 %p12, %rd59, 0;
-; CHECK-NEXT:    selp.b64 %rd112, 0, %rd42, %p11;
-; CHECK-NEXT:    selp.b64 %rd111, 0, %rd41, %p11;
-; CHECK-NEXT:    or.pred %p13, %p11, %p12;
-; CHECK-NEXT:    @%p13 bra $L__BB1_5;
+; CHECK-NEXT:    cvt.u64.u32 %rd53, %r4;
+; CHECK-NEXT:    add.s64 %rd54, %rd53, 64;
+; CHECK-NEXT:    selp.b64 %rd55, %rd52, %rd54, %p5;
+; CHECK-NEXT:    mov.b64 %rd104, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd58, %rd50, %rd55;
+; CHECK-NEXT:    subc.cc.s64 %rd6, %rd104, 0;
+; CHECK-NEXT:    setp.ne.s64 %p6, %rd6, 0;
+; CHECK-NEXT:    and.pred %p8, %p6, %p6;
+; CHECK-NEXT:    setp.eq.s64 %p9, %rd6, 0;
+; CHECK-NEXT:    setp.gt.u64 %p10, %rd58, 127;
+; CHECK-NEXT:    and.pred %p11, %p9, %p10;
+; CHECK-NEXT:    or.pred %p12, %p11, %p8;
+; CHECK-NEXT:    or.pred %p13, %p3, %p12;
+; CHECK-NEXT:    xor.b64 %rd59, %rd58, 127;
+; CHECK-NEXT:    or.b64 %rd60, %rd59, %rd6;
+; CHECK-NEXT:    setp.eq.s64 %p14, %rd60, 0;
+; CHECK-NEXT:    selp.b64 %rd113, 0, %rd42, %p13;
+; CHECK-NEXT:    selp.b64 %rd112, 0, %rd41, %p13;
+; CHECK-NEXT:    or.pred %p15, %p13, %p14;
+; CHECK-NEXT:    @%p15 bra $L__BB1_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd105, %rd56, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd106, %rd57, 0;
-; CHECK-NEXT:    or.b64 %rd62, %rd105, %rd106;
-; CHECK-NEXT:    setp.eq.s64 %p14, %rd62, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r5, %rd56;
+; CHECK-NEXT:    add.cc.s64 %rd106, %rd58, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd107, %rd6, 0;
+; CHECK-NEXT:    or.b64 %rd63, %rd106, %rd107;
+; CHECK-NEXT:    setp.eq.s64 %p16, %rd63, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r5, %rd58;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd63, %rd42, %r6;
+; CHECK-NEXT:    shl.b64 %rd64, %rd42, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd64, %rd41, %r7;
-; CHECK-NEXT:    or.b64 %rd65, %rd63, %rd64;
+; CHECK-NEXT:    shr.u64 %rd65, %rd41, %r7;
+; CHECK-NEXT:    or.b64 %rd66, %rd64, %rd65;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd66, %rd41, %r8;
-; CHECK-NEXT:    setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd110, %rd66, %rd65, %p15;
-; CHECK-NEXT:    shl.b64 %rd109, %rd41, %r6;
-; CHECK-NEXT:    mov.b64 %rd100, %rd103;
-; CHECK-NEXT:    @%p14 bra $L__BB1_4;
+; CHECK-NEXT:    shl.b64 %rd67, %rd41, %r8;
+; CHECK-NEXT:    setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT:    selp.b64 %rd111, %rd67, %rd66, %p17;
+; CHECK-NEXT:    shl.b64 %rd110, %rd41, %r6;
+; CHECK-NEXT:    mov.b64 %rd101, %rd104;
+; CHECK-NEXT:    @%p16 bra $L__BB1_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd105;
-; CHECK-NEXT:    shr.u64 %rd69, %rd41, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd106;
+; CHECK-NEXT:    shr.u64 %rd70, %rd41, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd70, %rd42, %r10;
-; CHECK-NEXT:    or.b64 %rd71, %rd69, %rd70;
+; CHECK-NEXT:    shl.b64 %rd71, %rd42, %r10;
+; CHECK-NEXT:    or.b64 %rd72, %rd70, %rd71;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd72, %rd42, %r11;
-; CHECK-NEXT:    setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd107, %rd72, %rd71, %p16;
-; CHECK-NEXT:    shr.u64 %rd108, %rd42, %r9;
+; CHECK-NEXT:    shr.u64 %rd73, %rd42, %r11;
+; CHECK-NEXT:    setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT:    selp.b64 %rd108, %rd73, %rd72, %p18;
+; CHECK-NEXT:    shr.u64 %rd109, %rd42, %r9;
 ; CHECK-NEXT:    add.cc.s64 %rd33, %rd3, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd34, %rd4, -1;
-; CHECK-NEXT:    mov.b64 %rd100, 0;
-; CHECK-NEXT:    mov.b64 %rd103, %rd100;
+; CHECK-NEXT:    mov.b64 %rd101, 0;
+; CHECK-NEXT:    mov.b64 %rd104, %rd101;
 ; CHECK-NEXT:  $L__BB1_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd73, %rd107, 63;
-; CHECK-NEXT:    shl.b64 %rd74, %rd108, 1;
-; CHECK-NEXT:    or.b64 %rd75, %rd74, %rd73;
-; CHECK-NEXT:    shl.b64 %rd76, %rd107, 1;
-; CHECK-NEXT:    shr.u64 %rd77, %rd110, 63;
-; CHECK-NEXT:    or.b64 %rd78, %rd76, %rd77;
-; CHECK-NEXT:    shr.u64 %rd79, %rd109, 63;
-; CHECK-NEXT:    shl.b64 %rd80, %rd110, 1;
-; CHECK-NEXT:    or.b64 %rd81, %rd80, %rd79;
-; CHECK-NEXT:    shl.b64 %rd82, %rd109, 1;
-; CHECK-NEXT:    or.b64 %rd109, %rd103, %rd82;
-; CHECK-NEXT:    or.b64 %rd110, %rd100, %rd81;
-; CHECK-NEXT:    sub.cc.s64 %rd83, %rd33, %rd78;
-; CHECK-NEXT:    subc.cc.s64 %rd84, %rd34, %rd75;
-; CHECK-NEXT:    shr.s64 %rd85, %rd84, 63;
-; CHECK-NEXT:    and.b64 %rd103, %rd85, 1;
-; CHECK-NEXT:    and.b64 %rd86, %rd85, %rd3;
-; CHECK-NEXT:    and.b64 %rd87, %rd85, %rd4;
-; CHECK-NEXT:    sub.cc.s64 %rd107, %rd78, %rd86;
-; CHECK-NEXT:    subc.cc.s64 %rd108, %rd75, %rd87;
-; CHECK-NEXT:    add.cc.s64 %rd105, %rd105, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd106, %rd106, -1;
-; CHECK-NEXT:    or.b64 %rd88, %rd105, %rd106;
-; CHECK-NEXT:    setp.eq.s64 %p17, %rd88, 0;
-; CHECK-NEXT:    @%p17 bra $L__BB1_4;
+; CHECK-NEXT:    shr.u64 %rd74, %rd108, 63;
+; CHECK-NEXT:    shl.b64 %rd75, %rd109, 1;
+; CHECK-NEXT:    or.b64 %rd76, %rd75, %rd74;
+; CHECK-NEXT:    shl.b64 %rd77, %rd108, 1;
+; CHECK-NEXT:    shr.u64 %rd78, %rd111, 63;
+; CHECK-NEXT:    or.b64 %rd79, %rd77, %rd78;
+; CHECK-NEXT:    shr.u64 %rd80, %rd110, 63;
+; CHECK-NEXT:    shl.b64 %rd81, %rd111, 1;
+; CHECK-NEXT:    or.b64 %rd82, %rd81, %rd80;
+; CHECK-NEXT:    shl.b64 %rd83, %rd110, 1;
+; CHECK-NEXT:    or.b64 %rd110, %rd104, %rd83;
+; CHECK-NEXT:    or.b64 %rd111, %rd101, %rd82;
+; CHECK-NEXT:    sub.cc.s64 %rd84, %rd33, %rd79;
+; CHECK-NEXT:    subc.cc.s64 %rd85, %rd34, %rd76;
+; CHECK-NEXT:    shr.s64 %rd86, %rd85, 63;
+; CHECK-NEXT:    and.b64 %rd104, %rd86, 1;
+; CHECK-NEXT:    and.b64 %rd87, %rd86, %rd3;
+; CHECK-NEXT:    and.b64 %rd88, %rd86, %rd4;
+; CHECK-NEXT:    sub.cc.s64 %rd108, %rd79, %rd87;
+; CHECK-NEXT:    subc.cc.s64 %rd109, %rd76, %rd88;
+; CHECK-NEXT:    add.cc.s64 %rd106, %rd106, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd107, %rd107, -1;
+; CHECK-NEXT:    or.b64 %rd89, %rd106, %rd107;
+; CHECK-NEXT:    setp.eq.s64 %p19, %rd89, 0;
+; CHECK-NEXT:    @%p19 bra $L__BB1_4;
 ; CHECK-NEXT:    bra.uni $L__BB1_2;
 ; CHECK-NEXT:  $L__BB1_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd89, %rd109, 63;
-; CHECK-NEXT:    shl.b64 %rd90, %rd110, 1;
-; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT:    shl.b64 %rd92, %rd109, 1;
-; CHECK-NEXT:    or.b64 %rd111, %rd103, %rd92;
-; CHECK-NEXT:    or.b64 %rd112, %rd100, %rd91;
+; CHECK-NEXT:    shr.u64 %rd90, %rd110, 63;
+; CHECK-NEXT:    shl.b64 %rd91, %rd111, 1;
+; CHECK-NEXT:    or.b64 %rd92, %rd91, %rd90;
+; CHECK-NEXT:    shl.b64 %rd93, %rd110, 1;
+; CHECK-NEXT:    or.b64 %rd112, %rd104, %rd93;
+; CHECK-NEXT:    or.b64 %rd113, %rd101, %rd92;
 ; CHECK-NEXT:  $L__BB1_5: // %udiv-end
-; CHECK-NEXT:    mul.hi.u64 %rd93, %rd3, %rd111;
-; CHECK-NEXT:    mad.lo.s64 %rd94, %rd3, %rd112, %rd93;
-; CHECK-NEXT:    mad.lo.s64 %rd95, %rd4, %rd111, %rd94;
-; CHECK-NEXT:    mul.lo.s64 %rd96, %rd3, %rd111;
-; CHECK-NEXT:    sub.cc.s64 %rd97, %rd41, %rd96;
-; CHECK-NEXT:    subc.cc.s64 %rd98, %rd42, %rd95;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd97, %rd98};
+; CHECK-NEXT:    mul.hi.u64 %rd94, %rd3, %rd112;
+; CHECK-NEXT:    mad.lo.s64 %rd95, %rd3, %rd113, %rd94;
+; CHECK-NEXT:    mad.lo.s64 %rd96, %rd4, %rd112, %rd95;
+; CHECK-NEXT:    mul.lo.s64 %rd97, %rd3, %rd112;
+; CHECK-NEXT:    sub.cc.s64 %rd98, %rd41, %rd97;
+; CHECK-NEXT:    subc.cc.s64 %rd99, %rd42, %rd96;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd98, %rd99};
 ; CHECK-NEXT:    ret;
   %div = urem i128 %lhs, %rhs
   ret i128 %div
@@ -307,9 +309,9 @@ define i128 @urem_i128_pow2k(i128 %lhs) {
 define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: sdiv_i128(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<20>;
+; CHECK-NEXT:    .reg .pred %p<22>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<122>;
+; CHECK-NEXT:    .reg .b64 %rd<123>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0];
@@ -340,101 +342,102 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    selp.b64 %rd61, %rd58, %rd60, %p6;
 ; CHECK-NEXT:    setp.ne.s64 %p7, %rd2, 0;
 ; CHECK-NEXT:    clz.b64 %r3, %rd2;
-; CHECK-NEXT:    cvt.u64.u32 %rd62, %r3;
+; CHECK-NEXT:    cvt.u64.u32 %rd63, %r3;
 ; CHECK-NEXT:    clz.b64 %r4, %rd1;
-; CHECK-NEXT:    cvt.u64.u32 %rd63, %r4;
-; CHECK-NEXT:    add.s64 %rd64, %rd63, 64;
-; CHECK-NEXT:    selp.b64 %rd65, %rd62, %rd64, %p7;
-; CHECK-NEXT:    mov.b64 %rd112, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd67, %rd61, %rd65;
-; CHECK-NEXT:    subc.cc.s64 %rd68, %rd112, 0;
-; CHECK-NEXT:    setp.gt.u64 %p8, %rd67, 127;
-; CHECK-NEXT:    setp.eq.s64 %p9, %rd68, 0;
-; CHECK-NEXT:    and.pred %p10, %p9, %p8;
-; CHECK-NEXT:    setp.ne.s64 %p11, %rd68, 0;
-; CHECK-NEXT:    or.pred %p12, %p10, %p11;
-; CHECK-NEXT:    or.pred %p13, %p5, %p12;
-; CHECK-NEXT:    xor.b64 %rd69, %rd67, 127;
-; CHECK-NEXT:    or.b64 %rd70, %rd69, %rd68;
-; CHECK-NEXT:    setp.eq.s64 %p14, %rd70, 0;
-; CHECK-NEXT:    selp.b64 %rd121, 0, %rd2, %p13;
-; CHECK-NEXT:    selp.b64 %rd120, 0, %rd1, %p13;
-; CHECK-NEXT:    or.pred %p15, %p13, %p14;
-; CHECK-NEXT:    @%p15 bra $L__BB4_5;
+; CHECK-NEXT:    cvt.u64.u32 %rd64, %r4;
+; CHECK-NEXT:    add.s64 %rd65, %rd64, 64;
+; CHECK-NEXT:    selp.b64 %rd66, %rd63, %rd65, %p7;
+; CHECK-NEXT:    mov.b64 %rd113, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd69, %rd61, %rd66;
+; CHECK-NEXT:    subc.cc.s64 %rd8, %rd113, 0;
+; CHECK-NEXT:    setp.ne.s64 %p8, %rd8, 0;
+; CHECK-NEXT:    and.pred %p10, %p8, %p8;
+; CHECK-NEXT:    setp.eq.s64 %p11, %rd8, 0;
+; CHECK-NEXT:    setp.gt.u64 %p12, %rd69, 127;
+; CHECK-NEXT:    and.pred %p13, %p11, %p12;
+; CHECK-NEXT:    or.pred %p14, %p13, %p10;
+; CHECK-NEXT:    or.pred %p15, %p5, %p14;
+; CHECK-NEXT:    xor.b64 %rd70, %rd69, 127;
+; CHECK-NEXT:    or.b64 %rd71, %rd70, %rd8;
+; CHECK-NEXT:    setp.eq.s64 %p16, %rd71, 0;
+; CHECK-NEXT:    selp.b64 %rd122, 0, %rd2, %p15;
+; CHECK-NEXT:    selp.b64 %rd121, 0, %rd1, %p15;
+; CHECK-NEXT:    or.pred %p17, %p15, %p16;
+; CHECK-NEXT:    @%p17 bra $L__BB4_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd114, %rd67, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd115, %rd68, 0;
-; CHECK-NEXT:    or.b64 %rd73, %rd114, %rd115;
-; CHECK-NEXT:    setp.eq.s64 %p16, %rd73, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r5, %rd67;
+; CHECK-NEXT:    add.cc.s64 %rd115, %rd69, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd116, %rd8, 0;
+; CHECK-NEXT:    or.b64 %rd74, %rd115, %rd116;
+; CHECK-NEXT:    setp.eq.s64 %p18, %rd74, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r5, %rd69;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd74, %rd2, %r6;
+; CHECK-NEXT:    shl.b64 %rd75, %rd2, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd75, %rd1, %r7;
-; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
+; CHECK-NEXT:    shr.u64 %rd76, %rd1, %r7;
+; CHECK-NEXT:    or.b64 %rd77, %rd75, %rd76;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd77, %rd1, %r8;
-; CHECK-NEXT:    setp.gt.s32 %p17, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd119, %rd77, %rd76, %p17;
-; CHECK-NEXT:    shl.b64 %rd118, %rd1, %r6;
-; CHECK-NEXT:    mov.b64 %rd109, %rd112;
-; CHECK-NEXT:    @%p16 bra $L__BB4_4;
+; CHECK-NEXT:    shl.b64 %rd78, %rd1, %r8;
+; CHECK-NEXT:    setp.gt.s32 %p19, %r6, 63;
+; CHECK-NEXT:    selp.b64 %rd120, %rd78, %rd77, %p19;
+; CHECK-NEXT:    shl.b64 %rd119, %rd1, %r6;
+; CHECK-NEXT:    mov.b64 %rd110, %rd113;
+; CHECK-NEXT:    @%p18 bra $L__BB4_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd114;
-; CHECK-NEXT:    shr.u64 %rd80, %rd1, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd115;
+; CHECK-NEXT:    shr.u64 %rd81, %rd1, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd81, %rd2, %r10;
-; CHECK-NEXT:    or.b64 %rd82, %rd80, %rd81;
+; CHECK-NEXT:    shl.b64 %rd82, %rd2, %r10;
+; CHECK-NEXT:    or.b64 %rd83, %rd81, %rd82;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd83, %rd2, %r11;
-; CHECK-NEXT:    setp.gt.s32 %p18, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd116, %rd83, %rd82, %p18;
-; CHECK-NEXT:    shr.u64 %rd117, %rd2, %r9;
+; CHECK-NEXT:    shr.u64 %rd84, %rd2, %r11;
+; CHECK-NEXT:    setp.gt.s32 %p20, %r9, 63;
+; CHECK-NEXT:    selp.b64 %rd117, %rd84, %rd83, %p20;
+; CHECK-NEXT:    shr.u64 %rd118, %rd2, %r9;
 ; CHECK-NEXT:    add.cc.s64 %rd35, %rd3, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd36, %rd4, -1;
-; CHECK-NEXT:    mov.b64 %rd109, 0;
-; CHECK-NEXT:    mov.b64 %rd112, %rd109;
+; CHECK-NEXT:    mov.b64 %rd110, 0;
+; CHECK-NEXT:    mov.b64 %rd113, %rd110;
 ; CHECK-NEXT:  $L__BB4_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd84, %rd116, 63;
-; CHECK-NEXT:    shl.b64 %rd85, %rd117, 1;
-; CHECK-NEXT:    or.b64 %rd86, %rd85, %rd84;
-; CHECK-NEXT:    shl.b64 %rd87, %rd116, 1;
-; CHECK-NEXT:    shr.u64 %rd88, %rd119, 63;
-; CHECK-NEXT:    or.b64 %rd89, %rd87, %rd88;
-; CHECK-NEXT:    shr.u64 %rd90, %rd118, 63;
-; CHECK-NEXT:    shl.b64 %rd91, %rd119, 1;
-; CHECK-NEXT:    or.b64 %rd92, %rd91, %rd90;
-; CHECK-NEXT:    shl.b64 %rd93, %rd118, 1;
-; CHECK-NEXT:    or.b64 %rd118, %rd112, %rd93;
-; CHECK-NEXT:    or.b64 %rd119, %rd109, %rd92;
-; CHECK-NEXT:    sub.cc.s64 %rd94, %rd35, %rd89;
-; CHECK-NEXT:    subc.cc.s64 %rd95, %rd36, %rd86;
-; CHECK-NEXT:    shr.s64 %rd96, %rd95, 63;
-; CHECK-NEXT:    and.b64 %rd112, %rd96, 1;
-; CHECK-NEXT:    and.b64 %rd97, %rd96, %rd3;
-; CHECK-NEXT:    and.b64 %rd98, %rd96, %rd4;
-; CHECK-NEXT:    sub.cc.s64 %rd116, %rd89, %rd97;
-; CHECK-NEXT:    subc.cc.s64 %rd117, %rd86, %rd98;
-; CHECK-NEXT:    add.cc.s64 %rd114, %rd114, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd115, %rd115, -1;
-; CHECK-NEXT:    or.b64 %rd99, %rd114, %rd115;
-; CHECK-NEXT:    setp.eq.s64 %p19, %rd99, 0;
-; CHECK-NEXT:    @%p19 bra $L__BB4_4;
+; CHECK-NEXT:    shr.u64 %rd85, %rd117, 63;
+; CHECK-NEXT:    shl.b64 %rd86, %rd118, 1;
+; CHECK-NEXT:    or.b64 %rd87, %rd86, %rd85;
+; CHECK-NEXT:    shl.b64 %rd88, %rd117, 1;
+; CHECK-NEXT:    shr.u64 %rd89, %rd120, 63;
+; CHECK-NEXT:    or.b64 %rd90, %rd88, %rd89;
+; CHECK-NEXT:    shr.u64 %rd91, %rd119, 63;
+; CHECK-NEXT:    shl.b64 %rd92, %rd120, 1;
+; CHECK-NEXT:    or.b64 %rd93, %rd92, %rd91;
+; CHECK-NEXT:    shl.b64 %rd94, %rd119, 1;
+; CHECK-NEXT:    or.b64 %rd119, %rd113, %rd94;
+; CHECK-NEXT:    or.b64 %rd120, %rd110, %rd93;
+; CHECK-NEXT:    sub.cc.s64 %rd95, %rd35, %rd90;
+; CHECK-NEXT:    subc.cc.s64 %rd96, %rd36, %rd87;
+; CHECK-NEXT:    shr.s64 %rd97, %rd96, 63;
+; CHECK-NEXT:    and.b64 %rd113, %rd97, 1;
+; CHECK-NEXT:    and.b64 %rd98, %rd97, %rd3;
+; CHECK-NEXT:    and.b64 %rd99, %rd97, %rd4;
+; CHECK-NEXT:    sub.cc.s64 %rd117, %rd90, %rd98;
+; CHECK-NEXT:    subc.cc.s64 %rd118, %rd87, %rd99;
+; CHECK-NEXT:    add.cc.s64 %rd115, %rd115, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd116, %rd116, -1;
+; CHECK-NEXT:    or.b64 %rd100, %rd115, %rd116;
+; CHECK-NEXT:    setp.eq.s64 %p21, %rd100, 0;
+; CHECK-NEXT:    @%p21 bra $L__BB4_4;
 ; CHECK-NEXT:    bra.uni $L__BB4_2;
 ; CHECK-NEXT:  $L__BB4_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd100, %rd118, 63;
-; CHECK-NEXT:    shl.b64 %rd101, %rd119, 1;
-; CHECK-NEXT:    or.b64 %rd102, %rd101, %rd100;
-; CHECK-NEXT:    shl.b64 %rd103, %rd118, 1;
-; CHECK-NEXT:    or.b64 %rd120, %rd112, %rd103;
-; CHECK-NEXT:    or.b64 %rd121, %rd109, %rd102;
+; CHECK-NEXT:    shr.u64 %rd101, %rd119, 63;
+; CHECK-NEXT:    shl.b64 %rd102, %rd120, 1;
+; CHECK-NEXT:    or.b64 %rd103, %rd102, %rd101;
+; CHECK-NEXT:    shl.b64 %rd104, %rd119, 1;
+; CHECK-NEXT:    or.b64 %rd121, %rd113, %rd104;
+; CHECK-NEXT:    or.b64 %rd122, %rd110, %rd103;
 ; CHECK-NEXT:  $L__BB4_5: // %udiv-end
-; CHECK-NEXT:    xor.b64 %rd104, %rd120, %rd5;
 ; CHECK-NEXT:    xor.b64 %rd105, %rd121, %rd5;
-; CHECK-NEXT:    sub.cc.s64 %rd106, %rd104, %rd5;
-; CHECK-NEXT:    subc.cc.s64 %rd107, %rd105, %rd5;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd106, %rd107};
+; CHECK-NEXT:    xor.b64 %rd106, %rd122, %rd5;
+; CHECK-NEXT:    sub.cc.s64 %rd107, %rd105, %rd5;
+; CHECK-NEXT:    subc.cc.s64 %rd108, %rd106, %rd5;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd107, %rd108};
 ; CHECK-NEXT:    ret;
   %div = sdiv i128 %lhs, %rhs
   ret i128 %div
@@ -443,9 +446,9 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: udiv_i128(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<18>;
+; CHECK-NEXT:    .reg .pred %p<20>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<107>;
+; CHECK-NEXT:    .reg .b64 %rd<108>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0];
@@ -464,97 +467,98 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    selp.b64 %rd50, %rd47, %rd49, %p4;
 ; CHECK-NEXT:    setp.ne.s64 %p5, %rd42, 0;
 ; CHECK-NEXT:    clz.b64 %r3, %rd42;
-; CHECK-NEXT:    cvt.u64.u32 %rd51, %r3;
+; CHECK-NEXT:    cvt.u64.u32 %rd52, %r3;
 ; CHECK-NEXT:    clz.b64 %r4, %rd41;
-; CHECK-NEXT:    cvt.u64.u32 %rd52, %r4;
-; CHECK-NEXT:    add.s64 %rd53, %rd52, 64;
-; CHECK-NEXT:    selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT:    mov.b64 %rd97, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd56, %rd50, %rd54;
-; CHECK-NEXT:    subc.cc.s64 %rd57, %rd97, 0;
-; CHECK-NEXT:    setp.gt.u64 %p6, %rd56, 127;
-; CHECK-NEXT:    setp.eq.s64 %p7, %rd57, 0;
-; CHECK-NEXT:    and.pred %p8, %p7, %p6;
-; CHECK-NEXT:    setp.ne.s64 %p9, %rd57, 0;
-; CHECK-NEXT:    or.pred %p10, %p8, %p9;
-; CHECK-NEXT:    or.pred %p11, %p3, %p10;
-; CHECK-NEXT:    xor.b64 %rd58, %rd56, 127;
-; CHECK-NEXT:    or.b64 %rd59, %rd58, %rd57;
-; CHECK-NEXT:    setp.eq.s64 %p12, %rd59, 0;
-; CHECK-NEXT:    selp.b64 %rd106, 0, %rd42, %p11;
-; CHECK-NEXT:    selp.b64 %rd105, 0, %rd41, %p11;
-; CHECK-NEXT:    or.pred %p13, %p11, %p12;
-; CHECK-NEXT:    @%p13 bra $L__BB5_5;
+; CHECK-NEXT:    cvt.u64.u32 %rd53, %r4;
+; CHECK-NEXT:    add.s64 %rd54, %rd53, 64;
+; CHECK-NEXT:    selp.b64 %rd55, %rd52, %rd54, %p5;
+; CHECK-NEXT:    mov.b64 %rd98, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd58, %rd50, %rd55;
+; CHECK-NEXT:    subc.cc.s64 %rd6, %rd98, 0;
+; CHECK-NEXT:    setp.ne.s64 %p6, %rd6, 0;
+; CHECK-NEXT:    and.pred %p8, %p6, %p6;
+; CHECK-NEXT:    setp.eq.s64 %p9, %rd6, 0;
+; CHECK-NEXT:    setp.gt.u64 %p10, %rd58, 127;
+; CHECK-NEXT:    and.pred %p11, %p9, %p10;
+; CHECK-NEXT:    or.pred %p12, %p11, %p8;
+; CHECK-NEXT:    or.pred %p13, %p3, %p12;
+; CHECK-NEXT:    xor.b64 %rd59, %rd58, 127;
+; CHECK-NEXT:    or.b64 %rd60, %rd59, %rd6;
+; CHECK-NEXT:    setp.eq.s64 %p14, %rd60, 0;
+; CHECK-NEXT:    selp.b64 %rd107, 0, %rd42, %p13;
+; CHECK-NEXT:    selp.b64 %rd106, 0, %rd41, %p13;
+; CHECK-NEXT:    or.pred %p15, %p13, %p14;
+; CHECK-NEXT:    @%p15 bra $L__BB5_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd99, %rd56, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd100, %rd57, 0;
-; CHECK-NEXT:    or.b64 %rd62, %rd99, %rd100;
-; CHECK-NEXT:    setp.eq.s64 %p14, %rd62, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r5, %rd56;
+; CHECK-NEXT:    add.cc.s64 %rd100, %rd58, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd101, %rd6, 0;
+; CHECK-NEXT:    or.b64 %rd63, %rd100, %rd101;
+; CHECK-NEXT:    setp.eq.s64 %p16, %rd63, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r5, %rd58;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd63, %rd42, %r6;
+; CHECK-NEXT:    shl.b64 %rd64, %rd42, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd64, %rd41, %r7;
-; CHECK-NEXT:    or.b64 %rd65, %rd63, %rd64;
+; CHECK-NEXT:    shr.u64 %rd65, %rd41, %r7;
+; CHECK-NEXT:    or.b64 %rd66, %rd64, %rd65;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd66, %rd41, %r8;
-; CHECK-NEXT:    setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd104, %rd66, %rd65, %p15;
-; CHECK-NEXT:    shl.b64 %rd103, %rd41, %r6;
-; CHECK-NEXT:    mov.b64 %rd94, %rd97;
-; CHECK-NEXT:    @%p14 bra $L__BB5_4;
+; CHECK-NEXT:    shl.b64 %rd67, %rd41, %r8;
+; CHECK-NEXT:    setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT:    selp.b64 %rd105, %rd67, %rd66, %p17;
+; CHECK-NEXT:    shl.b64 %rd104, %rd41, %r6;
+; CHECK-NEXT:    mov.b64 %rd95, %rd98;
+; CHECK-NEXT:    @%p16 bra $L__BB5_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd99;
-; CHECK-NEXT:    shr.u64 %rd69, %rd41, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd100;
+; CHECK-NEXT:    shr.u64 %rd70, %rd41, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd70, %rd42, %r10;
-; CHECK-NEXT:    or.b64 %rd71, %rd69, %rd70;
+; CHECK-NEXT:    shl.b64 %rd71, %rd42, %r10;
+; CHECK-NEXT:    or.b64 %rd72, %rd70, %rd71;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd72, %rd42, %r11;
-; CHECK-NEXT:    setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd101, %rd72, %rd71, %p16;
-; CHECK-NEXT:    shr.u64 %rd102, %rd42, %r9;
+; CHECK-NEXT:    shr.u64 %rd73, %rd42, %r11;
+; CHECK-NEXT:    setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT:    selp.b64 %rd102, %rd73, %rd72, %p18;
+; CHECK-NEXT:    shr.u64 %rd103, %rd42, %r9;
 ; CHECK-NEXT:    add.cc.s64 %rd33, %rd43, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd34, %rd44, -1;
-; CHECK-NEXT:    mov.b64 %rd94, 0;
-; CHECK-NEXT:    mov.b64 %rd97, %rd94;
+; CHECK-NEXT:    mov.b64 %rd95, 0;
+; CHECK-NEXT:    mov.b64 %rd98, %rd95;
 ; CHECK-NEXT:  $L__BB5_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd73, %rd101, 63;
-; CHECK-NEXT:    shl.b64 %rd74, %rd102, 1;
-; CHECK-NEXT:    or.b64 %rd75, %rd74, %rd73;
-; CHECK-NEXT:    shl.b64 %rd76, %rd101, 1;
-; CHECK-NEXT:    shr.u64 %rd77, %rd104, 63;
-; CHECK-NEXT:    or.b64 %rd78, %rd76, %rd77;
-; CHECK-NEXT:    shr.u64 %rd79, %rd103, 63;
-; CHECK-NEXT:    shl.b64 %rd80, %rd104, 1;
-; CHECK-NEXT:    or.b64 %rd81, %rd80, %rd79;
-; CHECK-NEXT:    shl.b64 %rd82, %rd103, 1;
-; CHECK-NEXT:    or.b64 %rd103, %rd97, %rd82;
-; CHECK-NEXT:    or.b64 %rd104, %rd94, %rd81;
-; CHECK-NEXT:    sub.cc.s64 %rd83, %rd33, %rd78;
-; CHECK-NEXT:    subc.cc.s64 %rd84, %rd34, %rd75;
-; CHECK-NEXT:    shr.s64 %rd85, %rd84, 63;
-; CHECK-NEXT:    and.b64 %rd97, %rd85, 1;
-; CHECK-NEXT:    and.b64 %rd86, %rd85, %rd43;
-; CHECK-NEXT:    and.b64 %rd87, %rd85, %rd44;
-; CHECK-NEXT:    sub.cc.s64 %rd101, %rd78, %rd86;
-; CHECK-NEXT:    subc.cc.s64 %rd102, %rd75, %rd87;
-; CHECK-NEXT:    add.cc.s64 %rd99, %rd99, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd100, %rd100, -1;
-; CHECK-NEXT:    or.b64 %rd88, %rd99, %rd100;
-; CHECK-NEXT:    setp.eq.s64 %p17, %rd88, 0;
-; CHECK-NEXT:    @%p17 bra $L__BB5_4;
+; CHECK-NEXT:    shr.u64 %rd74, %rd102, 63;
+; CHECK-NEXT:    shl.b64 %rd75, %rd103, 1;
+; CHECK-NEXT:    or.b64 %rd76, %rd75, %rd74;
+; CHECK-NEXT:    shl.b64 %rd77, %rd102, 1;
+; CHECK-NEXT:    shr.u64 %rd78, %rd105, 63;
+; CHECK-NEXT:    or.b64 %rd79, %rd77, %rd78;
+; CHECK-NEXT:    shr.u64 %rd80, %rd104, 63;
+; CHECK-NEXT:    shl.b64 %rd81, %rd105, 1;
+; CHECK-NEXT:    or.b64 %rd82, %rd81, %rd80;
+; CHECK-NEXT:    shl.b64 %rd83, %rd104, 1;
+; CHECK-NEXT:    or.b64 %rd104, %rd98, %rd83;
+; CHECK-NEXT:    or.b64 %rd105, %rd95, %rd82;
+; CHECK-NEXT:    sub.cc.s64 %rd84, %rd33, %rd79;
+; CHECK-NEXT:    subc.cc.s64 %rd85, %rd34, %rd76;
+; CHECK-NEXT:    shr.s64 %rd86, %rd85, 63;
+; CHECK-NEXT:    and.b64 %rd98, %rd86, 1;
+; CHECK-NEXT:    and.b64 %rd87, %rd86, %rd43;
+; CHECK-NEXT:    and.b64 %rd88, %rd86, %rd44;
+; CHECK-NEXT:    sub.cc.s64 %rd102, %rd79, %rd87;
+; CHECK-NEXT:    subc.cc.s64 %rd103, %rd76, %rd88;
+; CHECK-NEXT:    add.cc.s64 %rd100, %rd100, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd101, %rd101, -1;
+; CHECK-NEXT:    or.b64 %rd89, %rd100, %rd101;
+; CHECK-NEXT:    setp.eq.s64 %p19, %rd89, 0;
+; CHECK-NEXT:    @%p19 bra $L__BB5_4;
 ; CHECK-NEXT:    bra.uni $L__BB5_2;
 ; CHECK-NEXT:  $L__BB5_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd89, %rd103, 63;
-; CHECK-NEXT:    shl.b64 %rd90, %rd104, 1;
-; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT:    shl.b64 %rd92, %rd103, 1;
-; CHECK-NEXT:    or.b64 %rd105, %rd97, %rd92;
-; CHECK-NEXT:    or.b64 %rd106, %rd94, %rd91;
+; CHECK-NEXT:    shr.u64 %rd90, %rd104, 63;
+; CHECK-NEXT:    shl.b64 %rd91, %rd105, 1;
+; CHECK-NEXT:    or.b64 %rd92, %rd91, %rd90;
+; CHECK-NEXT:    shl.b64 %rd93, %rd104, 1;
+; CHECK-NEXT:    or.b64 %rd106, %rd98, %rd93;
+; CHECK-NEXT:    or.b64 %rd107, %rd95, %rd92;
 ; CHECK-NEXT:  $L__BB5_5: // %udiv-end
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd105, %rd106};
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd106, %rd107};
 ; CHECK-NEXT:    ret;
   %div = udiv i128 %lhs, %rhs
   ret i128 %div
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index b540948b20f75..821cfd00dcd07 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -764,13 +764,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
 ;
 ; CHECK-PWR7-LABEL: sub_absv_8_ext:
 ; CHECK-PWR7:       # %bb.0: # %entry
-; CHECK-PWR7-NEXT:    stdu r1, -512(r1)
-; CHECK-PWR7-NEXT:    .cfi_def_cfa_offset 512
-; CHECK-PWR7-NEXT:    .cfi_offset r14, -144
-; CHECK-PWR7-NEXT:    .cfi_offset r15, -136
-; CHECK-PWR7-NEXT:    .cfi_offset r16, -128
-; CHECK-PWR7-NEXT:    .cfi_offset r17, -120
-; CHECK-PWR7-NEXT:    .cfi_offset r18, -112
+; CHECK-PWR7-NEXT:    stdu r1, -448(r1)
+; CHECK-PWR7-NEXT:    .cfi_def_cfa_offset 448
 ; CHECK-PWR7-NEXT:    .cfi_offset r19, -104
 ; CHECK-PWR7-NEXT:    .cfi_offset r20, -96
 ; CHECK-PWR7-NEXT:    .cfi_offset r21, -88
@@ -783,244 +778,258 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
 ; CHECK-PWR7-NEXT:    .cfi_offset r28, -32
 ; CHECK-PWR7-NEXT:    .cfi_offset r29, -24
 ; CHECK-PWR7-NEXT:    .cfi_offset r30, -16
-; CHECK-PWR7-NEXT:    .cfi_offset r31, -8
-; CHECK-PWR7-NEXT:    .cfi_offset r2, -152
-; CHECK-PWR7-NEXT:    addi r3, r1, 320
-; CHECK-PWR7-NEXT:    std r14, 368(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r15, 376(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r16, 384(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r17, 392(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r18, 400(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r19, 408(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r20, 416(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r21, 424(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r22, 432(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r23, 440(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r24, 448(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r25, 456(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r26, 464(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r27, 472(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r28, 480(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r29, 488(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r30, 496(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r31, 504(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r2, 360(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    addi r3, r1, 304
+; CHECK-PWR7-NEXT:    std r19, 344(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r20, 352(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r21, 360(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r22, 368(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r23, 376(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r24, 384(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r25, 392(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r26, 400(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r27, 408(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r28, 416(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r29, 424(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r30, 432(r1) # 8-byte Folded Spill
 ; CHECK-PWR7-NEXT:    stxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT:    lbz r3, 320(r1)
-; CHECK-PWR7-NEXT:    addi r4, r1, 336
-; CHECK-PWR7-NEXT:    stw r3, 60(r1) # 4-byte Folded Spill
-; CHECK-PWR7-NEXT:    stxvw4x v3, 0, r4
-; CHECK-PWR7-NEXT:    lbz r15, 334(r1)
-; CHECK-PWR7-NEXT:    lbz r14, 350(r1)
-; CHECK-PWR7-NEXT:    lbz r31, 335(r1)
-; CHECK-PWR7-NEXT:    lbz r2, 351(r1)
-; CHECK-PWR7-NEXT:    sub r15, r15, r14
-; CHECK-PWR7-NEXT:    sub r14, r31, r2
-; CHECK-PWR7-NEXT:    srawi r2, r14, 31
-; CHECK-PWR7-NEXT:    xor r14, r14, r2
-; CHECK-PWR7-NEXT:    lbz r3, 333(r1)
-; CHECK-PWR7-NEXT:    lbz r19, 331(r1)
-; CHECK-PWR7-NEXT:    lbz r18, 347(r1)
-; CHECK-PWR7-NEXT:    sub r19, r19, r18
-; CHECK-PWR7-NEXT:    lbz r17, 332(r1)
-; CHECK-PWR7-NEXT:    lbz r16, 348(r1)
-; CHECK-PWR7-NEXT:    sub r17, r17, r16
-; CHECK-PWR7-NEXT:    lbz r23, 329(r1)
-; CHECK-PWR7-NEXT:    sub r14, r14, r2
-; CHECK-PWR7-NEXT:    lbz r2, 349(r1)
-; CHECK-PWR7-NEXT:    lbz r22, 345(r1)
-; CHECK-PWR7-NEXT:    lbz r4, 336(r1)
-; CHECK-PWR7-NEXT:    lbz r5, 321(r1)
-; CHECK-PWR7-NEXT:    lbz r6, 337(r1)
-; CHECK-PWR7-NEXT:    lbz r7, 322(r1)
-; CHECK-PWR7-NEXT:    lbz r8, 338(r1)
-; CHECK-PWR7-NEXT:    lbz r9, 323(r1)
-; CHECK-PWR7-NEXT:    lbz r10, 339(r1)
-; CHECK-PWR7-NEXT:    lbz r11, 324(r1)
-; CHECK-PWR7-NEXT:    lbz r12, 340(r1)
-; CHECK-PWR7-NEXT:    lbz r0, 325(r1)
-; CHECK-PWR7-NEXT:    lbz r30, 341(r1)
-; CHECK-PWR7-NEXT:    lbz r29, 326(r1)
-; CHECK-PWR7-NEXT:    lbz r28, 342(r1)
-; CHECK-PWR7-NEXT:    lbz r27, 327(r1)
-; CHECK-PWR7-NEXT:    lbz r26, 343(r1)
-; CHECK-PWR7-NEXT:    sub r3, r3, r2
-; CHECK-PWR7-NEXT:    lbz r25, 328(r1)
-; CHECK-PWR7-NEXT:    lbz r24, 344(r1)
-; CHECK-PWR7-NEXT:    lbz r21, 330(r1)
-; CHECK-PWR7-NEXT:    lbz r20, 346(r1)
+; CHECK-PWR7-NEXT:    addi r3, r1, 320
+; CHECK-PWR7-NEXT:    lbz r7, 304(r1)
+; CHECK-PWR7-NEXT:    stxvw4x v3, 0, r3
+; CHECK-PWR7-NEXT:    lbz r8, 320(r1)
+; CHECK-PWR7-NEXT:    lbz r9, 305(r1)
+; CHECK-PWR7-NEXT:    lbz r10, 321(r1)
+; CHECK-PWR7-NEXT:    lbz r26, 325(r1)
+; CHECK-PWR7-NEXT:    clrlwi r7, r7, 24
+; CHECK-PWR7-NEXT:    clrlwi r8, r8, 24
+; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
+; CHECK-PWR7-NEXT:    clrlwi r10, r10, 24
+; CHECK-PWR7-NEXT:    lbz r11, 306(r1)
+; CHECK-PWR7-NEXT:    lbz r12, 322(r1)
+; CHECK-PWR7-NEXT:    lbz r23, 314(r1)
+; CHECK-PWR7-NEXT:    clrlwi r22, r26, 24
+; CHECK-PWR7-NEXT:    lbz r26, 330(r1)
+; CHECK-PWR7-NEXT:    sub r8, r7, r8
+; CHECK-PWR7-NEXT:    lbz r7, 315(r1)
+; CHECK-PWR7-NEXT:    sub r20, r9, r10
+; CHECK-PWR7-NEXT:    lbz r9, 331(r1)
+; CHECK-PWR7-NEXT:    lbz r0, 307(r1)
+; CHECK-PWR7-NEXT:    lbz r30, 323(r1)
+; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
+; CHECK-PWR7-NEXT:    clrlwi r12, r12, 24
+; CHECK-PWR7-NEXT:    clrlwi r23, r23, 24
+; CHECK-PWR7-NEXT:    clrlwi r21, r26, 24
+; CHECK-PWR7-NEXT:    clrlwi r7, r7, 24
+; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
+; CHECK-PWR7-NEXT:    clrlwi r0, r0, 24
+; CHECK-PWR7-NEXT:    clrlwi r30, r30, 24
+; CHECK-PWR7-NEXT:    lbz r29, 308(r1)
+; CHECK-PWR7-NEXT:    lbz r28, 324(r1)
+; CHECK-PWR7-NEXT:    lbz r27, 309(r1)
+; CHECK-PWR7-NEXT:    lbz r25, 310(r1)
+; CHECK-PWR7-NEXT:    lbz r24, 326(r1)
+; CHECK-PWR7-NEXT:    sub r19, r11, r12
+; CHECK-PWR7-NEXT:    sub r11, r23, r21
+; CHECK-PWR7-NEXT:    sub r9, r7, r9
+; CHECK-PWR7-NEXT:    sub r26, r0, r30
+; CHECK-PWR7-NEXT:    srawi r12, r11, 31
+; CHECK-PWR7-NEXT:    srawi r0, r9, 31
+; CHECK-PWR7-NEXT:    lbz r3, 312(r1)
+; CHECK-PWR7-NEXT:    clrlwi r29, r29, 24
+; CHECK-PWR7-NEXT:    clrlwi r28, r28, 24
+; CHECK-PWR7-NEXT:    clrlwi r27, r27, 24
+; CHECK-PWR7-NEXT:    clrlwi r25, r25, 24
+; CHECK-PWR7-NEXT:    clrlwi r24, r24, 24
+; CHECK-PWR7-NEXT:    xor r11, r11, r12
+; CHECK-PWR7-NEXT:    xor r9, r9, r0
+; CHECK-PWR7-NEXT:    sub r28, r29, r28
+; CHECK-PWR7-NEXT:    sub r30, r27, r22
+; CHECK-PWR7-NEXT:    sub r29, r25, r24
+; CHECK-PWR7-NEXT:    sub r27, r11, r12
+; CHECK-PWR7-NEXT:    sub r24, r9, r0
+; CHECK-PWR7-NEXT:    lbz r9, 316(r1)
+; CHECK-PWR7-NEXT:    lbz r11, 332(r1)
+; CHECK-PWR7-NEXT:    lbz r4, 328(r1)
+; CHECK-PWR7-NEXT:    lbz r5, 311(r1)
+; CHECK-PWR7-NEXT:    lbz r6, 327(r1)
+; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
+; CHECK-PWR7-NEXT:    clrlwi r3, r3, 24
+; CHECK-PWR7-NEXT:    clrlwi r4, r4, 24
+; CHECK-PWR7-NEXT:    clrlwi r5, r5, 24
+; CHECK-PWR7-NEXT:    clrlwi r6, r6, 24
+; CHECK-PWR7-NEXT:    sub r3, r3, r4
 ; CHECK-PWR7-NEXT:    sub r5, r5, r6
-; CHECK-PWR7-NEXT:    srawi r18, r3, 31
-; CHECK-PWR7-NEXT:    sub r7, r7, r8
-; CHECK-PWR7-NEXT:    sub r9, r9, r10
-; CHECK-PWR7-NEXT:    sub r11, r11, r12
-; CHECK-PWR7-NEXT:    sub r0, r0, r30
-; CHECK-PWR7-NEXT:    sub r29, r29, r28
-; CHECK-PWR7-NEXT:    sub r27, r27, r26
-; CHECK-PWR7-NEXT:    sub r25, r25, r24
-; CHECK-PWR7-NEXT:    srawi r31, r15, 31
-; CHECK-PWR7-NEXT:    ld r2, 360(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    xor r3, r3, r18
+; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
+; CHECK-PWR7-NEXT:    srawi r4, r3, 31
 ; CHECK-PWR7-NEXT:    srawi r6, r5, 31
-; CHECK-PWR7-NEXT:    srawi r8, r7, 31
-; CHECK-PWR7-NEXT:    srawi r10, r9, 31
-; CHECK-PWR7-NEXT:    srawi r12, r11, 31
-; CHECK-PWR7-NEXT:    srawi r30, r0, 31
-; CHECK-PWR7-NEXT:    sub r3, r3, r18
-; CHECK-PWR7-NEXT:    srawi r18, r19, 31
-; CHECK-PWR7-NEXT:    srawi r28, r29, 31
-; CHECK-PWR7-NEXT:    ld r16, 384(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    srawi r26, r27, 31
-; CHECK-PWR7-NEXT:    srawi r24, r25, 31
-; CHECK-PWR7-NEXT:    xor r19, r19, r18
-; CHECK-PWR7-NEXT:    xor r15, r15, r31
+; CHECK-PWR7-NEXT:    xor r3, r3, r4
+; CHECK-PWR7-NEXT:    sldi r27, r27, 56
 ; CHECK-PWR7-NEXT:    xor r5, r5, r6
-; CHECK-PWR7-NEXT:    std r3, 272(r1)
-; CHECK-PWR7-NEXT:    std r3, 280(r1)
-; CHECK-PWR7-NEXT:    srawi r3, r17, 31
-; CHECK-PWR7-NEXT:    sub r19, r19, r18
-; CHECK-PWR7-NEXT:    xor r7, r7, r8
-; CHECK-PWR7-NEXT:    sub r15, r15, r31
-; CHECK-PWR7-NEXT:    xor r17, r17, r3
-; CHECK-PWR7-NEXT:    xor r9, r9, r10
-; CHECK-PWR7-NEXT:    xor r11, r11, r12
-; CHECK-PWR7-NEXT:    xor r0, r0, r30
-; CHECK-PWR7-NEXT:    xor r29, r29, r28
-; CHECK-PWR7-NEXT:    xor r27, r27, r26
-; CHECK-PWR7-NEXT:    sub r3, r17, r3
-; CHECK-PWR7-NEXT:    xor r25, r25, r24
-; CHECK-PWR7-NEXT:    sub r25, r25, r24
-; CHECK-PWR7-NEXT:    sub r27, r27, r26
-; CHECK-PWR7-NEXT:    sub r29, r29, r28
+; CHECK-PWR7-NEXT:    sub r9, r9, r11
+; CHECK-PWR7-NEXT:    sub r3, r3, r4
+; CHECK-PWR7-NEXT:    sldi r24, r24, 56
 ; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    sub r0, r0, r30
-; CHECK-PWR7-NEXT:    sub r11, r11, r12
-; CHECK-PWR7-NEXT:    sub r9, r9, r10
-; CHECK-PWR7-NEXT:    sub r7, r7, r8
-; CHECK-PWR7-NEXT:    sub r5, r5, r6
-; CHECK-PWR7-NEXT:    sldi r14, r14, 56
-; CHECK-PWR7-NEXT:    sldi r15, r15, 56
-; CHECK-PWR7-NEXT:    ld r31, 504(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    std r3, 256(r1)
-; CHECK-PWR7-NEXT:    std r3, 264(r1)
-; CHECK-PWR7-NEXT:    sldi r3, r19, 56
+; CHECK-PWR7-NEXT:    srawi r11, r9, 31
+; CHECK-PWR7-NEXT:    std r27, 208(r1)
+; CHECK-PWR7-NEXT:    sub r4, r5, r6
+; CHECK-PWR7-NEXT:    std r27, 216(r1)
+; CHECK-PWR7-NEXT:    srawi r27, r29, 31
+; CHECK-PWR7-NEXT:    lbz r10, 313(r1)
+; CHECK-PWR7-NEXT:    xor r9, r9, r11
+; CHECK-PWR7-NEXT:    std r24, 224(r1)
+; CHECK-PWR7-NEXT:    lbz r22, 329(r1)
+; CHECK-PWR7-NEXT:    std r24, 232(r1)
+; CHECK-PWR7-NEXT:    srawi r24, r30, 31
+; CHECK-PWR7-NEXT:    ld r21, 360(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    sub r23, r9, r11
+; CHECK-PWR7-NEXT:    lbz r9, 317(r1)
+; CHECK-PWR7-NEXT:    lbz r11, 333(r1)
+; CHECK-PWR7-NEXT:    xor r29, r29, r27
+; CHECK-PWR7-NEXT:    std r3, 176(r1)
+; CHECK-PWR7-NEXT:    std r3, 184(r1)
+; CHECK-PWR7-NEXT:    sldi r3, r4, 56
+; CHECK-PWR7-NEXT:    sldi r23, r23, 56
+; CHECK-PWR7-NEXT:    xor r30, r30, r24
+; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
+; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
+; CHECK-PWR7-NEXT:    sub r4, r30, r24
+; CHECK-PWR7-NEXT:    ld r30, 432(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r3, 160(r1)
+; CHECK-PWR7-NEXT:    std r3, 168(r1)
+; CHECK-PWR7-NEXT:    sub r9, r9, r11
+; CHECK-PWR7-NEXT:    sub r3, r29, r27
+; CHECK-PWR7-NEXT:    std r23, 240(r1)
+; CHECK-PWR7-NEXT:    ld r29, 424(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    srawi r11, r9, 31
+; CHECK-PWR7-NEXT:    std r23, 248(r1)
+; CHECK-PWR7-NEXT:    ld r27, 408(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    srawi r23, r28, 31
+; CHECK-PWR7-NEXT:    sldi r3, r3, 56
+; CHECK-PWR7-NEXT:    xor r28, r28, r23
+; CHECK-PWR7-NEXT:    xor r9, r9, r11
+; CHECK-PWR7-NEXT:    std r3, 144(r1)
+; CHECK-PWR7-NEXT:    ld r24, 384(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r3, 152(r1)
+; CHECK-PWR7-NEXT:    sldi r3, r4, 56
+; CHECK-PWR7-NEXT:    sub r25, r9, r11
+; CHECK-PWR7-NEXT:    lbz r9, 318(r1)
+; CHECK-PWR7-NEXT:    lbz r11, 334(r1)
+; CHECK-PWR7-NEXT:    std r3, 128(r1)
 ; CHECK-PWR7-NEXT:    sldi r25, r25, 56
-; CHECK-PWR7-NEXT:    sldi r27, r27, 56
-; CHECK-PWR7-NEXT:    std r3, 240(r1)
-; CHECK-PWR7-NEXT:    std r3, 248(r1)
-; CHECK-PWR7-NEXT:    sub r3, r23, r22
-; CHECK-PWR7-NEXT:    srawi r23, r3, 31
-; CHECK-PWR7-NEXT:    sub r22, r21, r20
-; CHECK-PWR7-NEXT:    srawi r21, r22, 31
-; CHECK-PWR7-NEXT:    sldi r29, r29, 56
-; CHECK-PWR7-NEXT:    sldi r0, r0, 56
-; CHECK-PWR7-NEXT:    sldi r11, r11, 56
-; CHECK-PWR7-NEXT:    xor r3, r3, r23
-; CHECK-PWR7-NEXT:    xor r22, r22, r21
-; CHECK-PWR7-NEXT:    sldi r9, r9, 56
-; CHECK-PWR7-NEXT:    sldi r7, r7, 56
-; CHECK-PWR7-NEXT:    sldi r5, r5, 56
-; CHECK-PWR7-NEXT:    ld r30, 496(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    ld r28, 480(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    sub r3, r3, r23
-; CHECK-PWR7-NEXT:    sub r22, r22, r21
-; CHECK-PWR7-NEXT:    std r14, 304(r1)
-; CHECK-PWR7-NEXT:    ld r26, 464(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r3, 136(r1)
+; CHECK-PWR7-NEXT:    sub r3, r28, r23
 ; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    sldi r22, r22, 56
-; CHECK-PWR7-NEXT:    ld r24, 448(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    ld r23, 440(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    std r14, 312(r1)
-; CHECK-PWR7-NEXT:    std r15, 288(r1)
-; CHECK-PWR7-NEXT:    std r3, 208(r1)
-; CHECK-PWR7-NEXT:    std r3, 216(r1)
-; CHECK-PWR7-NEXT:    lwz r3, 60(r1) # 4-byte Folded Reload
-; CHECK-PWR7-NEXT:    std r15, 296(r1)
-; CHECK-PWR7-NEXT:    ld r21, 424(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    ld r20, 416(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    std r22, 224(r1)
-; CHECK-PWR7-NEXT:    std r22, 232(r1)
-; CHECK-PWR7-NEXT:    sub r4, r3, r4
-; CHECK-PWR7-NEXT:    std r25, 192(r1)
-; CHECK-PWR7-NEXT:    ld r22, 432(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    ld r19, 408(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    srawi r3, r4, 31
-; CHECK-PWR7-NEXT:    std r25, 200(r1)
-; CHECK-PWR7-NEXT:    ld r25, 456(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    std r27, 176(r1)
-; CHECK-PWR7-NEXT:    std r27, 184(r1)
-; CHECK-PWR7-NEXT:    xor r4, r4, r3
-; CHECK-PWR7-NEXT:    std r29, 160(r1)
-; CHECK-PWR7-NEXT:    ld r27, 472(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    std r29, 168(r1)
-; CHECK-PWR7-NEXT:    std r0, 144(r1)
-; CHECK-PWR7-NEXT:    sub r3, r4, r3
-; CHECK-PWR7-NEXT:    std r0, 152(r1)
-; CHECK-PWR7-NEXT:    ld r29, 488(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    ld r18, 400(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r3, 112(r1)
+; CHECK-PWR7-NEXT:    ld r28, 416(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
+; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
+; CHECK-PWR7-NEXT:    clrlwi r10, r10, 24
+; CHECK-PWR7-NEXT:    std r25, 256(r1)
+; CHECK-PWR7-NEXT:    std r25, 264(r1)
+; CHECK-PWR7-NEXT:    sub r9, r9, r11
+; CHECK-PWR7-NEXT:    srawi r25, r26, 31
+; CHECK-PWR7-NEXT:    xor r26, r26, r25
+; CHECK-PWR7-NEXT:    ld r23, 376(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    srawi r11, r9, 31
+; CHECK-PWR7-NEXT:    std r3, 120(r1)
+; CHECK-PWR7-NEXT:    sub r4, r26, r25
+; CHECK-PWR7-NEXT:    clrlwi r22, r22, 24
+; CHECK-PWR7-NEXT:    srawi r7, r8, 31
+; CHECK-PWR7-NEXT:    sub r10, r10, r22
+; CHECK-PWR7-NEXT:    ld r26, 400(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    xor r9, r9, r11
+; CHECK-PWR7-NEXT:    sldi r3, r4, 56
+; CHECK-PWR7-NEXT:    srawi r22, r10, 31
+; CHECK-PWR7-NEXT:    xor r8, r8, r7
+; CHECK-PWR7-NEXT:    xor r10, r10, r22
+; CHECK-PWR7-NEXT:    sub r10, r10, r22
+; CHECK-PWR7-NEXT:    ld r25, 392(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    sub r12, r9, r11
+; CHECK-PWR7-NEXT:    lbz r9, 319(r1)
+; CHECK-PWR7-NEXT:    lbz r11, 335(r1)
+; CHECK-PWR7-NEXT:    std r3, 96(r1)
+; CHECK-PWR7-NEXT:    sldi r12, r12, 56
+; CHECK-PWR7-NEXT:    std r3, 104(r1)
+; CHECK-PWR7-NEXT:    ld r22, 368(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    sldi r10, r10, 56
+; CHECK-PWR7-NEXT:    std r10, 192(r1)
+; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
+; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
+; CHECK-PWR7-NEXT:    sub r9, r9, r11
+; CHECK-PWR7-NEXT:    std r12, 272(r1)
+; CHECK-PWR7-NEXT:    std r12, 280(r1)
+; CHECK-PWR7-NEXT:    srawi r12, r19, 31
+; CHECK-PWR7-NEXT:    xor r0, r19, r12
+; CHECK-PWR7-NEXT:    ld r19, 344(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    sub r3, r0, r12
+; CHECK-PWR7-NEXT:    srawi r11, r9, 31
+; CHECK-PWR7-NEXT:    std r10, 200(r1)
+; CHECK-PWR7-NEXT:    xor r9, r9, r11
 ; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    std r11, 128(r1)
-; CHECK-PWR7-NEXT:    ld r17, 392(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    std r11, 136(r1)
-; CHECK-PWR7-NEXT:    std r9, 112(r1)
+; CHECK-PWR7-NEXT:    sub r9, r9, r11
+; CHECK-PWR7-NEXT:    std r3, 80(r1)
+; CHECK-PWR7-NEXT:    std r3, 88(r1)
+; CHECK-PWR7-NEXT:    sldi r9, r9, 56
+; CHECK-PWR7-NEXT:    std r9, 288(r1)
+; CHECK-PWR7-NEXT:    std r9, 296(r1)
+; CHECK-PWR7-NEXT:    srawi r9, r20, 31
+; CHECK-PWR7-NEXT:    xor r11, r20, r9
+; CHECK-PWR7-NEXT:    ld r20, 352(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    sub r4, r11, r9
+; CHECK-PWR7-NEXT:    sldi r3, r4, 56
 ; CHECK-PWR7-NEXT:    std r3, 64(r1)
 ; CHECK-PWR7-NEXT:    std r3, 72(r1)
-; CHECK-PWR7-NEXT:    addi r3, r1, 304
-; CHECK-PWR7-NEXT:    std r9, 120(r1)
-; CHECK-PWR7-NEXT:    ld r15, 376(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    std r7, 96(r1)
-; CHECK-PWR7-NEXT:    std r7, 104(r1)
-; CHECK-PWR7-NEXT:    std r5, 80(r1)
-; CHECK-PWR7-NEXT:    std r5, 88(r1)
-; CHECK-PWR7-NEXT:    lxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT:    sub r3, r8, r7
+; CHECK-PWR7-NEXT:    sldi r3, r3, 56
+; CHECK-PWR7-NEXT:    std r3, 48(r1)
+; CHECK-PWR7-NEXT:    std r3, 56(r1)
 ; CHECK-PWR7-NEXT:    addi r3, r1, 288
-; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
+; CHECK-PWR7-NEXT:    lxvw4x v2, 0, r3
 ; CHECK-PWR7-NEXT:    addi r3, r1, 272
-; CHECK-PWR7-NEXT:    ld r14, 368(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    vmrghb v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
 ; CHECK-PWR7-NEXT:    addi r3, r1, 256
-; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
+; CHECK-PWR7-NEXT:    vmrghb v2, v3, v2
+; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
 ; CHECK-PWR7-NEXT:    addi r3, r1, 240
+; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
+; CHECK-PWR7-NEXT:    addi r3, r1, 224
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
 ; CHECK-PWR7-NEXT:    vmrghh v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 224
-; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
 ; CHECK-PWR7-NEXT:    addi r3, r1, 208
-; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
 ; CHECK-PWR7-NEXT:    addi r3, r1, 192
-; CHECK-PWR7-NEXT:    lxvw4x v5, 0, r3
+; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
+; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
 ; CHECK-PWR7-NEXT:    addi r3, r1, 176
+; CHECK-PWR7-NEXT:    lxvw4x v5, 0, r3
+; CHECK-PWR7-NEXT:    addi r3, r1, 160
 ; CHECK-PWR7-NEXT:    vmrghb v4, v5, v4
 ; CHECK-PWR7-NEXT:    vmrghh v3, v4, v3
 ; CHECK-PWR7-NEXT:    xxmrghw vs0, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 160
-; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
 ; CHECK-PWR7-NEXT:    addi r3, r1, 144
-; CHECK-PWR7-NEXT:    vmrghb v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
 ; CHECK-PWR7-NEXT:    addi r3, r1, 128
+; CHECK-PWR7-NEXT:    vmrghb v2, v3, v2
+; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
+; CHECK-PWR7-NEXT:    addi r3, r1, 112
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
+; CHECK-PWR7-NEXT:    addi r3, r1, 96
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
-; CHECK-PWR7-NEXT:    addi r3, r1, 112
 ; CHECK-PWR7-NEXT:    vmrghh v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 96
-; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
 ; CHECK-PWR7-NEXT:    addi r3, r1, 80
-; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
 ; CHECK-PWR7-NEXT:    addi r3, r1, 64
+; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
+; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
+; CHECK-PWR7-NEXT:    addi r3, r1, 48
 ; CHECK-PWR7-NEXT:    lxvw4x v5, 0, r3
 ; CHECK-PWR7-NEXT:    vmrghb v4, v5, v4
 ; CHECK-PWR7-NEXT:    vmrghh v3, v4, v3
 ; CHECK-PWR7-NEXT:    xxmrghw vs1, v3, v2
 ; CHECK-PWR7-NEXT:    xxmrghd v2, vs1, vs0
-; CHECK-PWR7-NEXT:    addi r1, r1, 512
+; CHECK-PWR7-NEXT:    addi r1, r1, 448
 ; CHECK-PWR7-NEXT:    blr
 entry:
   %vecext = extractelement <16 x i8> %a, i32 0
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 117e3e4aac45d..246e6a614d6aa 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    mv a1, a0
 ; RV32IF-NEXT:    addi a0, sp, 8
 ; RV32IF-NEXT:    call __fixdfti
-; RV32IF-NEXT:    lw a0, 20(sp)
-; RV32IF-NEXT:    lw a1, 8(sp)
-; RV32IF-NEXT:    lw a2, 12(sp)
+; RV32IF-NEXT:    lw a0, 8(sp)
+; RV32IF-NEXT:    lw a1, 12(sp)
+; RV32IF-NEXT:    lw a2, 20(sp)
 ; RV32IF-NEXT:    lw a3, 16(sp)
-; RV32IF-NEXT:    beqz a0, .LBB47_2
+; RV32IF-NEXT:    beqz a2, .LBB47_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
-; RV32IF-NEXT:    slti a4, a0, 0
+; RV32IF-NEXT:    slti a4, a2, 0
 ; RV32IF-NEXT:    j .LBB47_3
 ; RV32IF-NEXT:  .LBB47_2:
 ; RV32IF-NEXT:    seqz a4, a3
 ; RV32IF-NEXT:  .LBB47_3: # %entry
 ; RV32IF-NEXT:    xori a3, a3, 1
-; RV32IF-NEXT:    or a3, a3, a0
+; RV32IF-NEXT:    or a3, a3, a2
 ; RV32IF-NEXT:    seqz a3, a3
 ; RV32IF-NEXT:    addi a3, a3, -1
 ; RV32IF-NEXT:    and a3, a3, a4
 ; RV32IF-NEXT:    neg a3, a3
-; RV32IF-NEXT:    and a2, a3, a2
 ; RV32IF-NEXT:    and a1, a3, a1
 ; RV32IF-NEXT:    and a0, a3, a0
-; RV32IF-NEXT:    slti a0, a0, 0
-; RV32IF-NEXT:    addi a3, a0, -1
-; RV32IF-NEXT:    and a0, a3, a1
-; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    and a2, a3, a2
+; RV32IF-NEXT:    slti a2, a2, 0
+; RV32IF-NEXT:    addi a2, a2, -1
+; RV32IF-NEXT:    and a0, a2, a0
+; RV32IF-NEXT:    and a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    .cfi_restore ra
 ; RV32IF-NEXT:    addi sp, sp, 32
@@ -3354,30 +3354,30 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    .cfi_offset ra, -4
 ; RV32IFD-NEXT:    addi a0, sp, 8
 ; RV32IFD-NEXT:    call __fixdfti
-; RV32IFD-NEXT:    lw a0, 20(sp)
-; RV32IFD-NEXT:    lw a1, 8(sp)
-; RV32IFD-NEXT:    lw a2, 12(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    lw a2, 20(sp)
 ; RV32IFD-NEXT:    lw a3, 16(sp)
-; RV32IFD-NEXT:    beqz a0, .LBB47_2
+; RV32IFD-NEXT:    beqz a2, .LBB47_2
 ; RV32IFD-NEXT:  # %bb.1: # %entry
-; RV32IFD-NEXT:    slti a4, a0, 0
+; RV32IFD-NEXT:    slti a4, a2, 0
 ; RV32IFD-NEXT:    j .LBB47_3
 ; RV32IFD-NEXT:  .LBB47_2:
 ; RV32IFD-NEXT:    seqz a4, a3
 ; RV32IFD-NEXT:  .LBB47_3: # %entry
 ; RV32IFD-NEXT:    xori a3, a3, 1
-; RV32IFD-NEXT:    or a3, a3, a0
+; RV32IFD-NEXT:    or a3, a3, a2
 ; RV32IFD-NEXT:    seqz a3, a3
 ; RV32IFD-NEXT:    addi a3, a3, -1
 ; RV32IFD-NEXT:    and a3, a3, a4
 ; RV32IFD-NEXT:    neg a3, a3
-; RV32IFD-NEXT:    and a2, a3, a2
 ; RV32IFD-NEXT:    and a1, a3, a1
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    slti a0, a0, 0
-; RV32IFD-NEXT:    addi a3, a0, -1
-; RV32IFD-NEXT:    and a0, a3, a1
-; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    and a2, a3, a2
+; RV32IFD-NEXT:    slti a2, a2, 0
+; RV32IFD-NEXT:    addi a2, a2, -1
+; RV32IFD-NEXT:    and a0, a2, a0
+; RV32IFD-NEXT:    and a1, a2, a1
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    .cfi_restore ra
 ; RV32IFD-NEXT:    addi sp, sp, 32
@@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti
-; RV32-NEXT:    lw a0, 20(sp)
-; RV32-NEXT:    lw a1, 8(sp)
-; RV32-NEXT:    lw a2, 12(sp)
+; RV32-NEXT:    lw a0, 8(sp)
+; RV32-NEXT:    lw a1, 12(sp)
+; RV32-NEXT:    lw a2, 20(sp)
 ; RV32-NEXT:    lw a3, 16(sp)
-; RV32-NEXT:    beqz a0, .LBB50_2
+; RV32-NEXT:    beqz a2, .LBB50_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a0, 0
+; RV32-NEXT:    slti a4, a2, 0
 ; RV32-NEXT:    j .LBB50_3
 ; RV32-NEXT:  .LBB50_2:
 ; RV32-NEXT:    seqz a4, a3
 ; RV32-NEXT:  .LBB50_3: # %entry
 ; RV32-NEXT:    xori a3, a3, 1
-; RV32-NEXT:    or a3, a3, a0
+; RV32-NEXT:    or a3, a3, a2
 ; RV32-NEXT:    seqz a3, a3
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    neg a3, a3
-; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    and a0, a3, a0
-; RV32-NEXT:    slti a0, a0, 0
-; RV32-NEXT:    addi a3, a0, -1
-; RV32-NEXT:    and a0, a3, a1
-; RV32-NEXT:    and a1, a3, a2
+; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    slti a2, a2, 0
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
@@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; RV32-NEXT:    call __extendhfsf2
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti
-; RV32-NEXT:    lw a0, 20(sp)
-; RV32-NEXT:    lw a1, 8(sp)
-; RV32-NEXT:    lw a2, 12(sp)
+; RV32-NEXT:    lw a0, 8(sp)
+; RV32-NEXT:    lw a1, 12(sp)
+; RV32-NEXT:    lw a2, 20(sp)
 ; RV32-NEXT:    lw a3, 16(sp)
-; RV32-NEXT:    beqz a0, .LBB53_2
+; RV32-NEXT:    beqz a2, .LBB53_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a0, 0
+; RV32-NEXT:    slti a4, a2, 0
 ; RV32-NEXT:    j .LBB53_3
 ; RV32-NEXT:  .LBB53_2:
 ; RV32-NEXT:    seqz a4, a3
 ; RV32-NEXT:  .LBB53_3: # %entry
 ; RV32-NEXT:    xori a3, a3, 1
-; RV32-NEXT:    or a3, a3, a0
+; RV32-NEXT:    or a3, a3, a2
 ; RV32-NEXT:    seqz a3, a3
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    neg a3, a3
-; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    and a0, a3, a0
-; RV32-NEXT:    slti a0, a0, 0
-; RV32-NEXT:    addi a3, a0, -1
-; RV32-NEXT:    and a0, a3, a1
-; RV32-NEXT:    and a1, a3, a2
+; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    slti a2, a2, 0
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index 97d102561129d..b1a6d163664e5 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -7,18 +7,18 @@
 define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
 ; RV32-LABEL: ctz_nxv4i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; RV32-NEXT:    vid.v v10
-; RV32-NEXT:    li a1, -1
+; RV32-NEXT:    vmv.v.i v11, -1
+; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v8, 0
 ; RV32-NEXT:    srli a0, a0, 1
 ; RV32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vmadd.vx v10, a1, v8
-; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    vmerge.vvm v8, v8, v10, v0
+; RV32-NEXT:    vmacc.vv v8, v10, v11
+; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    sub a0, a0, a1
@@ -28,18 +28,18 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
 ;
 ; RV64-LABEL: ctz_nxv4i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; RV64-NEXT:    vid.v v10
-; RV64-NEXT:    li a1, -1
+; RV64-NEXT:    vmv.v.i v11, -1
+; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-NEXT:    srli a0, a0, 1
 ; RV64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
-; RV64-NEXT:    vmadd.vx v10, a1, v8
-; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmerge.vvm v8, v8, v10, v0
+; RV64-NEXT:    vmacc.vv v8, v10, v11
+; RV64-NEXT:    vmv.v.i v9, 0
+; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    subw a0, a0, a1
@@ -109,17 +109,17 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
 ;
 ; RV64-LABEL: ctz_nxv8i1_no_range:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vid.v v16
-; RV64-NEXT:    li a1, -1
+; RV64-NEXT:    vmv.v.i v24, -1
+; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
-; RV64-NEXT:    vmadd.vx v16, a1, v8
-; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmerge.vvm v8, v8, v16, v0
+; RV64-NEXT:    vmacc.vv v8, v16, v24
+; RV64-NEXT:    vmv.v.i v16, 0
+; RV64-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    sub a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
index 38df622998bf9..7b839be76a8b8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -3927,6 +3927,7 @@ define void @trunc_v8f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI171_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI171_0)(a1)
+; ZVFH-NEXT:    vmv.v.v v8, v8
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
@@ -4007,6 +4008,7 @@ define void @trunc_v4f32(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
@@ -4028,6 +4030,7 @@ define void @trunc_v2f64(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI174_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI174_0)(a1)
+; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
@@ -4103,6 +4106,7 @@ define void @ceil_v8f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI177_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI177_0)(a1)
+; ZVFH-NEXT:    vmv.v.v v8, v8
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    fsrmi a1, 3
@@ -4191,6 +4195,7 @@ define void @ceil_v4f32(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a1, 3
@@ -4214,6 +4219,7 @@ define void @ceil_v2f64(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI180_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI180_0)(a1)
+; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a1, 3
@@ -4291,6 +4297,7 @@ define void @floor_v8f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI183_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI183_0)(a1)
+; ZVFH-NEXT:    vmv.v.v v8, v8
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    fsrmi a1, 2
@@ -4379,6 +4386,7 @@ define void @floor_v4f32(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a1, 2
@@ -4402,6 +4410,7 @@ define void @floor_v2f64(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI186_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI186_0)(a1)
+; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a1, 2
@@ -4479,6 +4488,7 @@ define void @round_v8f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI189_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI189_0)(a1)
+; ZVFH-NEXT:    vmv.v.v v8, v8
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    fsrmi a1, 4
@@ -4567,6 +4577,7 @@ define void @round_v4f32(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a1, 4
@@ -4590,6 +4601,7 @@ define void @round_v2f64(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI192_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI192_0)(a1)
+; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a1, 4
@@ -4638,6 +4650,7 @@ define void @rint_v8f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI194_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI194_0)(a1)
+; ZVFH-NEXT:    vmv.v.v v8, v8
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
@@ -4678,6 +4691,7 @@ define void @rint_v4f32(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
@@ -4699,6 +4713,7 @@ define void @rint_v2f64(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI196_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI196_0)(a1)
+; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
@@ -4747,6 +4762,7 @@ define void @nearbyint_v8f16(ptr %x) {
 ; ZVFH-NEXT:    vle16.v v8, (a0)
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI198_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI198_0)(a1)
+; ZVFH-NEXT:    vmv.v.v v8, v8
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    frflags a1
@@ -4791,6 +4807,7 @@ define void @nearbyint_v4f32(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a1
@@ -4814,6 +4831,7 @@ define void @nearbyint_v2f64(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI200_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI200_0)(a1)
+; CHECK-NEXT:    vmv.v.v v8, v8
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index cd7f30d8f5898..32753ca382fc7 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -716,101 +716,92 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 8(a0)
-; RV32I-NEXT:    lbu a6, 9(a0)
-; RV32I-NEXT:    lbu t3, 10(a0)
-; RV32I-NEXT:    lbu t4, 11(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a4, a4, a3
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a3, t0, a7
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 9(a0)
+; RV32I-NEXT:    lbu t0, 10(a0)
+; RV32I-NEXT:    lbu t3, 11(a0)
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    lbu a6, 12(a0)
-; RV32I-NEXT:    lbu t1, 13(a0)
-; RV32I-NEXT:    lbu t2, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    or a0, a0, t2
-; RV32I-NEXT:    lbu t1, 1(a1)
-; RV32I-NEXT:    lbu t2, 0(a1)
-; RV32I-NEXT:    lbu t4, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a7, t3, t0
+; RV32I-NEXT:    lbu t0, 12(a0)
+; RV32I-NEXT:    lbu t2, 13(a0)
+; RV32I-NEXT:    lbu t3, 14(a0)
+; RV32I-NEXT:    lbu t4, 15(a0)
+; RV32I-NEXT:    lbu a0, 0(a1)
 ; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t4
-; RV32I-NEXT:    mv t2, sp
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, t0, a7
-; RV32I-NEXT:    or a5, t3, a5
-; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    or a1, a1, t1
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a0, 12(sp)
-; RV32I-NEXT:    srli a0, a1, 3
-; RV32I-NEXT:    andi a3, a1, 31
-; RV32I-NEXT:    andi a0, a0, 12
-; RV32I-NEXT:    xori a3, a3, 31
-; RV32I-NEXT:    add a0, t2, a0
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    lw a5, 8(a0)
-; RV32I-NEXT:    lw a6, 0(a0)
-; RV32I-NEXT:    lw a0, 12(a0)
-; RV32I-NEXT:    srl a7, a4, a1
-; RV32I-NEXT:    slli t0, a5, 1
-; RV32I-NEXT:    srl a6, a6, a1
-; RV32I-NEXT:    slli a4, a4, 1
-; RV32I-NEXT:    srl a5, a5, a1
-; RV32I-NEXT:    slli t1, a0, 1
-; RV32I-NEXT:    srl a0, a0, a1
-; RV32I-NEXT:    sll a1, t0, a3
-; RV32I-NEXT:    sll a4, a4, a3
-; RV32I-NEXT:    sll a3, t1, a3
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or a1, t2, t0
+; RV32I-NEXT:    mv t0, sp
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    srli t3, a0, 3
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    andi a5, a0, 31
+; RV32I-NEXT:    andi t3, t3, 12
+; RV32I-NEXT:    xori a5, a5, 31
+; RV32I-NEXT:    or a3, t1, a3
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a1, t2, a1
+; RV32I-NEXT:    add t0, t0, t3
+; RV32I-NEXT:    sw a4, 0(sp)
+; RV32I-NEXT:    sw a3, 4(sp)
+; RV32I-NEXT:    sw a6, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
+; RV32I-NEXT:    lw a1, 4(t0)
+; RV32I-NEXT:    lw a3, 8(t0)
+; RV32I-NEXT:    lw a4, 0(t0)
+; RV32I-NEXT:    lw a6, 12(t0)
+; RV32I-NEXT:    srl a7, a1, a0
+; RV32I-NEXT:    slli t0, a3, 1
+; RV32I-NEXT:    srl a4, a4, a0
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    srl a3, a3, a0
+; RV32I-NEXT:    slli t1, a6, 1
+; RV32I-NEXT:    srl a0, a6, a0
+; RV32I-NEXT:    sll a6, t0, a5
+; RV32I-NEXT:    sll a1, a1, a5
+; RV32I-NEXT:    sll a5, t1, a5
 ; RV32I-NEXT:    srli t0, a0, 16
 ; RV32I-NEXT:    srli t1, a0, 24
 ; RV32I-NEXT:    srli t2, a0, 8
-; RV32I-NEXT:    or a1, a7, a1
-; RV32I-NEXT:    or a4, a6, a4
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb t2, 13(a2)
 ; RV32I-NEXT:    sb t0, 14(a2)
 ; RV32I-NEXT:    sb t1, 15(a2)
 ; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    srli a5, a3, 24
-; RV32I-NEXT:    srli a6, a3, 8
-; RV32I-NEXT:    srli a7, a4, 16
-; RV32I-NEXT:    srli t0, a4, 24
-; RV32I-NEXT:    srli t1, a4, 8
-; RV32I-NEXT:    srli t2, a1, 16
-; RV32I-NEXT:    srli t3, a1, 24
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    srli a5, a3, 8
+; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    srli t0, a1, 24
+; RV32I-NEXT:    srli t1, a1, 8
+; RV32I-NEXT:    srli t2, a6, 16
+; RV32I-NEXT:    srli t3, a6, 24
 ; RV32I-NEXT:    sb a3, 8(a2)
-; RV32I-NEXT:    sb a6, 9(a2)
+; RV32I-NEXT:    sb a5, 9(a2)
 ; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    sb a5, 11(a2)
-; RV32I-NEXT:    srli a0, a1, 8
-; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a0, a6, 8
+; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    sb t1, 1(a2)
 ; RV32I-NEXT:    sb a7, 2(a2)
 ; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a1, 4(a2)
+; RV32I-NEXT:    sb a6, 4(a2)
 ; RV32I-NEXT:    sb a0, 5(a2)
 ; RV32I-NEXT:    sb t2, 6(a2)
 ; RV32I-NEXT:    sb t3, 7(a2)
@@ -952,102 +943,93 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 8(a0)
-; RV32I-NEXT:    lbu a6, 9(a0)
-; RV32I-NEXT:    lbu t3, 10(a0)
-; RV32I-NEXT:    lbu t4, 11(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a4, a4, a3
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a3, t0, a7
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 9(a0)
+; RV32I-NEXT:    lbu t0, 10(a0)
+; RV32I-NEXT:    lbu t3, 11(a0)
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    lbu a6, 12(a0)
-; RV32I-NEXT:    lbu t1, 13(a0)
-; RV32I-NEXT:    lbu t2, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    or a0, a0, t2
-; RV32I-NEXT:    lbu t1, 1(a1)
-; RV32I-NEXT:    lbu t2, 0(a1)
-; RV32I-NEXT:    lbu t4, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t2
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a7, t3, t0
+; RV32I-NEXT:    lbu t0, 12(a0)
+; RV32I-NEXT:    lbu t2, 13(a0)
+; RV32I-NEXT:    lbu t3, 14(a0)
+; RV32I-NEXT:    lbu t4, 15(a0)
+; RV32I-NEXT:    lbu a0, 0(a1)
 ; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t4
-; RV32I-NEXT:    addi t2, sp, 16
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or a1, t2, t0
+; RV32I-NEXT:    addi t0, sp, 16
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    srli t3, a0, 3
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    andi a5, a0, 31
+; RV32I-NEXT:    andi t3, t3, 12
+; RV32I-NEXT:    or a3, t1, a3
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a1, t2, a1
+; RV32I-NEXT:    sub a7, t0, t3
+; RV32I-NEXT:    sw a4, 16(sp)
+; RV32I-NEXT:    sw a3, 20(sp)
+; RV32I-NEXT:    sw a6, 24(sp)
+; RV32I-NEXT:    sw a1, 28(sp)
+; RV32I-NEXT:    lw a1, 0(a7)
+; RV32I-NEXT:    lw a3, 4(a7)
+; RV32I-NEXT:    lw a4, 8(a7)
+; RV32I-NEXT:    lw a6, 12(a7)
+; RV32I-NEXT:    xori a5, a5, 31
+; RV32I-NEXT:    sll a7, a3, a0
+; RV32I-NEXT:    srli t0, a1, 1
+; RV32I-NEXT:    sll a6, a6, a0
+; RV32I-NEXT:    srli t1, a4, 1
+; RV32I-NEXT:    sll a4, a4, a0
+; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    sll a0, a1, a0
+; RV32I-NEXT:    srl a1, t0, a5
+; RV32I-NEXT:    srl t0, t1, a5
+; RV32I-NEXT:    srl a3, a3, a5
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    srli t1, a0, 24
+; RV32I-NEXT:    srli t2, a0, 8
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a6, a6, t0
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, t0, a7
-; RV32I-NEXT:    or a5, t3, a5
-; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    or a1, a1, t1
-; RV32I-NEXT:    sw a3, 16(sp)
-; RV32I-NEXT:    sw a4, 20(sp)
-; RV32I-NEXT:    sw a5, 24(sp)
-; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    srli a0, a1, 3
-; RV32I-NEXT:    andi a3, a1, 31
-; RV32I-NEXT:    andi a0, a0, 12
-; RV32I-NEXT:    sub a0, t2, a0
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 4(a0)
-; RV32I-NEXT:    lw a6, 8(a0)
-; RV32I-NEXT:    lw a0, 12(a0)
-; RV32I-NEXT:    xori a3, a3, 31
-; RV32I-NEXT:    sll a7, a5, a1
-; RV32I-NEXT:    srli t0, a4, 1
-; RV32I-NEXT:    sll a0, a0, a1
-; RV32I-NEXT:    srli t1, a6, 1
-; RV32I-NEXT:    sll a6, a6, a1
-; RV32I-NEXT:    srli a5, a5, 1
-; RV32I-NEXT:    sll a1, a4, a1
-; RV32I-NEXT:    srl a4, t0, a3
-; RV32I-NEXT:    srl t0, t1, a3
-; RV32I-NEXT:    srl a3, a5, a3
-; RV32I-NEXT:    srli a5, a1, 16
-; RV32I-NEXT:    srli t1, a1, 24
-; RV32I-NEXT:    srli t2, a1, 8
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    or a0, a0, t0
-; RV32I-NEXT:    or a3, a6, a3
-; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a0, 0(a2)
 ; RV32I-NEXT:    sb t2, 1(a2)
 ; RV32I-NEXT:    sb a5, 2(a2)
 ; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    srli a1, a3, 16
-; RV32I-NEXT:    srli a5, a3, 24
-; RV32I-NEXT:    srli a6, a3, 8
-; RV32I-NEXT:    srli a7, a0, 16
-; RV32I-NEXT:    srli t0, a0, 24
-; RV32I-NEXT:    srli t1, a0, 8
-; RV32I-NEXT:    srli t2, a4, 16
-; RV32I-NEXT:    srli t3, a4, 24
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    srli a4, a3, 24
+; RV32I-NEXT:    srli a5, a3, 8
+; RV32I-NEXT:    srli a7, a6, 16
+; RV32I-NEXT:    srli t0, a6, 24
+; RV32I-NEXT:    srli t1, a6, 8
+; RV32I-NEXT:    srli t2, a1, 16
+; RV32I-NEXT:    srli t3, a1, 24
 ; RV32I-NEXT:    sb a3, 8(a2)
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    sb a1, 10(a2)
-; RV32I-NEXT:    sb a5, 11(a2)
-; RV32I-NEXT:    srli a1, a4, 8
-; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    sb a4, 11(a2)
+; RV32I-NEXT:    srli a0, a1, 8
+; RV32I-NEXT:    sb a6, 12(a2)
 ; RV32I-NEXT:    sb t1, 13(a2)
 ; RV32I-NEXT:    sb a7, 14(a2)
 ; RV32I-NEXT:    sb t0, 15(a2)
-; RV32I-NEXT:    sb a4, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    sb a1, 4(a2)
+; RV32I-NEXT:    sb a0, 5(a2)
 ; RV32I-NEXT:    sb t2, 6(a2)
 ; RV32I-NEXT:    sb t3, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
@@ -1186,82 +1168,73 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 8(a0)
-; RV32I-NEXT:    lbu t3, 9(a0)
-; RV32I-NEXT:    lbu t4, 10(a0)
-; RV32I-NEXT:    lbu t5, 11(a0)
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    lbu a6, 8(a0)
+; RV32I-NEXT:    lbu a7, 9(a0)
+; RV32I-NEXT:    lbu t0, 10(a0)
+; RV32I-NEXT:    lbu t3, 11(a0)
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli t3, t3, 24
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a7, t3, t0
 ; RV32I-NEXT:    lbu t0, 12(a0)
-; RV32I-NEXT:    lbu t1, 13(a0)
-; RV32I-NEXT:    lbu t2, 14(a0)
-; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or a4, t3, a4
-; RV32I-NEXT:    or t3, t5, t4
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    lbu t1, 1(a1)
-; RV32I-NEXT:    lbu t4, 0(a1)
-; RV32I-NEXT:    lbu t5, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t1, t1, t4
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t5
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    mv a5, sp
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or t2, a0, t2
-; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    lbu t2, 13(a0)
+; RV32I-NEXT:    lbu t3, 14(a0)
+; RV32I-NEXT:    lbu t4, 15(a0)
+; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or a1, t2, t0
+; RV32I-NEXT:    mv t0, sp
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    srli a4, a0, 3
+; RV32I-NEXT:    or a5, t1, a5
+; RV32I-NEXT:    andi t1, a0, 31
+; RV32I-NEXT:    or t2, t4, t3
+; RV32I-NEXT:    srai t3, t4, 31
+; RV32I-NEXT:    andi a4, a4, 12
+; RV32I-NEXT:    xori t1, t1, 31
 ; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a4, t3, a4
-; RV32I-NEXT:    or a7, t2, t0
-; RV32I-NEXT:    or a1, a1, t1
-; RV32I-NEXT:    sw a0, 16(sp)
-; RV32I-NEXT:    sw a0, 20(sp)
-; RV32I-NEXT:    sw a0, 24(sp)
-; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    or a1, t2, a1
+; RV32I-NEXT:    sw t3, 16(sp)
+; RV32I-NEXT:    sw t3, 20(sp)
+; RV32I-NEXT:    sw t3, 24(sp)
+; RV32I-NEXT:    sw t3, 28(sp)
+; RV32I-NEXT:    add a4, t0, a4
 ; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a6, 4(sp)
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a7, 12(sp)
-; RV32I-NEXT:    srli a0, a1, 3
-; RV32I-NEXT:    andi a3, a1, 31
-; RV32I-NEXT:    andi a0, a0, 12
-; RV32I-NEXT:    xori a3, a3, 31
-; RV32I-NEXT:    add a0, a5, a0
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    lw a5, 8(a0)
-; RV32I-NEXT:    lw a6, 0(a0)
-; RV32I-NEXT:    lw a0, 12(a0)
-; RV32I-NEXT:    srl a7, a4, a1
-; RV32I-NEXT:    slli t0, a5, 1
-; RV32I-NEXT:    srl a6, a6, a1
-; RV32I-NEXT:    slli a4, a4, 1
-; RV32I-NEXT:    srl a5, a5, a1
-; RV32I-NEXT:    slli t1, a0, 1
-; RV32I-NEXT:    sra a0, a0, a1
-; RV32I-NEXT:    sll a1, t0, a3
-; RV32I-NEXT:    sll a4, a4, a3
-; RV32I-NEXT:    sll a3, t1, a3
+; RV32I-NEXT:    sw a5, 4(sp)
+; RV32I-NEXT:    sw a6, 8(sp)
+; RV32I-NEXT:    sw a1, 12(sp)
+; RV32I-NEXT:    lw a1, 4(a4)
+; RV32I-NEXT:    lw a3, 8(a4)
+; RV32I-NEXT:    lw a5, 0(a4)
+; RV32I-NEXT:    lw a4, 12(a4)
+; RV32I-NEXT:    srl a6, a1, a0
+; RV32I-NEXT:    slli a7, a3, 1
+; RV32I-NEXT:    srl a5, a5, a0
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    srl a3, a3, a0
+; RV32I-NEXT:    slli t0, a4, 1
+; RV32I-NEXT:    sra a0, a4, a0
+; RV32I-NEXT:    sll a4, a7, t1
+; RV32I-NEXT:    sll a1, a1, t1
+; RV32I-NEXT:    sll a7, t0, t1
 ; RV32I-NEXT:    srli t0, a0, 16
 ; RV32I-NEXT:    srli t1, a0, 24
 ; RV32I-NEXT:    srli t2, a0, 8
-; RV32I-NEXT:    or a1, a7, a1
 ; RV32I-NEXT:    or a4, a6, a4
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a1, a5, a1
+; RV32I-NEXT:    or a3, a3, a7
 ; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb t2, 13(a2)
 ; RV32I-NEXT:    sb t0, 14(a2)
@@ -1269,21 +1242,21 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli a0, a3, 16
 ; RV32I-NEXT:    srli a5, a3, 24
 ; RV32I-NEXT:    srli a6, a3, 8
-; RV32I-NEXT:    srli a7, a4, 16
-; RV32I-NEXT:    srli t0, a4, 24
-; RV32I-NEXT:    srli t1, a4, 8
-; RV32I-NEXT:    srli t2, a1, 16
-; RV32I-NEXT:    srli t3, a1, 24
+; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    srli t0, a1, 24
+; RV32I-NEXT:    srli t1, a1, 8
+; RV32I-NEXT:    srli t2, a4, 16
+; RV32I-NEXT:    srli t3, a4, 24
 ; RV32I-NEXT:    sb a3, 8(a2)
 ; RV32I-NEXT:    sb a6, 9(a2)
 ; RV32I-NEXT:    sb a0, 10(a2)
 ; RV32I-NEXT:    sb a5, 11(a2)
-; RV32I-NEXT:    srli a0, a1, 8
-; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    srli a0, a4, 8
+; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    sb t1, 1(a2)
 ; RV32I-NEXT:    sb a7, 2(a2)
 ; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a1, 4(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
 ; RV32I-NEXT:    sb a0, 5(a2)
 ; RV32I-NEXT:    sb t2, 6(a2)
 ; RV32I-NEXT:    sb t3, 7(a2)
@@ -1299,19 +1272,17 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -160
-; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -144
+; RV64I-NEXT:    sd s0, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -1328,143 +1299,122 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s1, 13(a0)
 ; RV64I-NEXT:    lbu s2, 14(a0)
 ; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    lbu s4, 16(a0)
 ; RV64I-NEXT:    lbu s5, 17(a0)
 ; RV64I-NEXT:    lbu s6, 18(a0)
 ; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    slli s8, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a5, a4, a3
-; RV64I-NEXT:    or a6, a6, s8
-; RV64I-NEXT:    or a3, t0, a7
-; RV64I-NEXT:    or a4, t2, t1
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t5, t5, 16
 ; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    lbu t5, 20(a0)
+; RV64I-NEXT:    lbu t6, 21(a0)
+; RV64I-NEXT:    lbu s8, 22(a0)
+; RV64I-NEXT:    lbu s9, 23(a0)
 ; RV64I-NEXT:    slli s1, s1, 8
 ; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    or t1, s1, s0
-; RV64I-NEXT:    or t2, s3, s2
-; RV64I-NEXT:    lbu t6, 24(a0)
-; RV64I-NEXT:    lbu s0, 25(a0)
-; RV64I-NEXT:    lbu s1, 26(a0)
-; RV64I-NEXT:    lbu s2, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s6, s6, 16
 ; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
 ; RV64I-NEXT:    or t3, s5, s4
 ; RV64I-NEXT:    or t4, s7, s6
-; RV64I-NEXT:    or t5, s9, s8
-; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s0, 24(a0)
+; RV64I-NEXT:    lbu s1, 25(a0)
+; RV64I-NEXT:    lbu s2, 26(a0)
+; RV64I-NEXT:    lbu s3, 27(a0)
+; RV64I-NEXT:    slli t6, t6, 8
+; RV64I-NEXT:    slli s8, s8, 16
+; RV64I-NEXT:    slli s9, s9, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    or t5, t6, t5
+; RV64I-NEXT:    or t6, s9, s8
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    lbu s1, 28(a0)
 ; RV64I-NEXT:    lbu s4, 29(a0)
 ; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu s6, 31(a0)
-; RV64I-NEXT:    slli s10, s10, 16
-; RV64I-NEXT:    slli s11, s11, 24
-; RV64I-NEXT:    slli s0, s0, 8
-; RV64I-NEXT:    slli s1, s1, 16
-; RV64I-NEXT:    slli s2, s2, 24
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    or a0, s11, s10
-; RV64I-NEXT:    or t6, s0, t6
-; RV64I-NEXT:    or s0, s2, s1
-; RV64I-NEXT:    or s1, s4, s3
-; RV64I-NEXT:    lbu s2, 0(a1)
-; RV64I-NEXT:    lbu s3, 1(a1)
-; RV64I-NEXT:    lbu s4, 2(a1)
-; RV64I-NEXT:    lbu s7, 3(a1)
-; RV64I-NEXT:    slli s5, s5, 16
-; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    slli s3, s3, 8
-; RV64I-NEXT:    slli s4, s4, 16
-; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    or s5, s6, s5
-; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    or s3, s7, s4
-; RV64I-NEXT:    lbu s4, 5(a1)
-; RV64I-NEXT:    lbu s6, 4(a1)
-; RV64I-NEXT:    lbu s7, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    or s4, s4, s6
-; RV64I-NEXT:    slli s7, s7, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, s7
+; RV64I-NEXT:    lbu a0, 0(a1)
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    mv a6, sp
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a1, s3, s2
+; RV64I-NEXT:    mv s2, sp
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    or s1, s4, s1
+; RV64I-NEXT:    srli s3, a0, 3
+; RV64I-NEXT:    or s4, s6, s5
+; RV64I-NEXT:    andi s5, a0, 63
+; RV64I-NEXT:    andi s3, s3, 24
+; RV64I-NEXT:    xori s5, s5, 63
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, t0, a7
-; RV64I-NEXT:    or a7, t2, t1
-; RV64I-NEXT:    or t0, t4, t3
-; RV64I-NEXT:    or a0, a0, t5
-; RV64I-NEXT:    or t1, s0, t6
-; RV64I-NEXT:    or t2, s5, s1
-; RV64I-NEXT:    or t3, s3, s2
-; RV64I-NEXT:    or a1, a1, s4
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    slli t2, t2, 32
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a3, a3, a5
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    or a5, t2, t1
-; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    or t1, s4, s1
+; RV64I-NEXT:    add s2, s2, s3
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli t0, t0, 32
+; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a1, t1, a1
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a0, 16(sp)
-; RV64I-NEXT:    sd a5, 24(sp)
-; RV64I-NEXT:    srli a0, a1, 3
-; RV64I-NEXT:    andi a3, a1, 63
-; RV64I-NEXT:    andi a0, a0, 24
-; RV64I-NEXT:    xori a3, a3, 63
-; RV64I-NEXT:    add a0, a6, a0
-; RV64I-NEXT:    ld a4, 8(a0)
-; RV64I-NEXT:    ld a5, 16(a0)
-; RV64I-NEXT:    ld a6, 0(a0)
-; RV64I-NEXT:    ld a0, 24(a0)
-; RV64I-NEXT:    srl a7, a4, a1
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a1, 24(sp)
+; RV64I-NEXT:    ld a1, 8(s2)
+; RV64I-NEXT:    ld a3, 16(s2)
+; RV64I-NEXT:    ld a4, 0(s2)
+; RV64I-NEXT:    ld a5, 24(s2)
+; RV64I-NEXT:    srl a6, a1, a0
+; RV64I-NEXT:    slli a7, a3, 1
+; RV64I-NEXT:    srl a4, a4, a0
+; RV64I-NEXT:    slli a1, a1, 1
+; RV64I-NEXT:    srl a3, a3, a0
 ; RV64I-NEXT:    slli t0, a5, 1
-; RV64I-NEXT:    srl a6, a6, a1
-; RV64I-NEXT:    slli a4, a4, 1
-; RV64I-NEXT:    srl a5, a5, a1
-; RV64I-NEXT:    slli t1, a0, 1
-; RV64I-NEXT:    srl t2, a0, a1
-; RV64I-NEXT:    sll a0, t0, a3
-; RV64I-NEXT:    sll a1, a4, a3
-; RV64I-NEXT:    sll a3, t1, a3
-; RV64I-NEXT:    srli a4, t2, 56
-; RV64I-NEXT:    srli t0, t2, 48
-; RV64I-NEXT:    srli t1, t2, 40
-; RV64I-NEXT:    srli t3, t2, 32
-; RV64I-NEXT:    srli t4, t2, 24
-; RV64I-NEXT:    srli t5, t2, 16
-; RV64I-NEXT:    srli t6, t2, 8
-; RV64I-NEXT:    or a0, a7, a0
-; RV64I-NEXT:    or a1, a6, a1
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    srl a5, a5, a0
+; RV64I-NEXT:    sll a0, a7, s5
+; RV64I-NEXT:    sll a1, a1, s5
+; RV64I-NEXT:    sll a7, t0, s5
+; RV64I-NEXT:    srli t0, a5, 56
+; RV64I-NEXT:    srli t1, a5, 48
+; RV64I-NEXT:    srli t2, a5, 40
+; RV64I-NEXT:    srli t3, a5, 32
+; RV64I-NEXT:    srli t4, a5, 24
+; RV64I-NEXT:    srli t5, a5, 16
+; RV64I-NEXT:    srli t6, a5, 8
+; RV64I-NEXT:    or a0, a6, a0
+; RV64I-NEXT:    or a1, a4, a1
+; RV64I-NEXT:    or a3, a3, a7
 ; RV64I-NEXT:    sb t3, 28(a2)
-; RV64I-NEXT:    sb t1, 29(a2)
-; RV64I-NEXT:    sb t0, 30(a2)
-; RV64I-NEXT:    sb a4, 31(a2)
-; RV64I-NEXT:    sb t2, 24(a2)
+; RV64I-NEXT:    sb t2, 29(a2)
+; RV64I-NEXT:    sb t1, 30(a2)
+; RV64I-NEXT:    sb t0, 31(a2)
+; RV64I-NEXT:    sb a5, 24(a2)
 ; RV64I-NEXT:    sb t6, 25(a2)
 ; RV64I-NEXT:    sb t5, 26(a2)
 ; RV64I-NEXT:    sb t4, 27(a2)
@@ -1513,19 +1463,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a1, 9(a2)
 ; RV64I-NEXT:    sb a5, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 160
+; RV64I-NEXT:    ld s0, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 144
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes:
@@ -1550,67 +1498,55 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a7, 3(a0)
 ; RV32I-NEXT:    lbu a5, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s2, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t6, 7(a0)
+; RV32I-NEXT:    lbu s2, 8(a0)
+; RV32I-NEXT:    lbu s3, 9(a0)
+; RV32I-NEXT:    lbu s4, 10(a0)
+; RV32I-NEXT:    lbu s5, 11(a0)
+; RV32I-NEXT:    lbu s7, 12(a0)
+; RV32I-NEXT:    lbu s8, 13(a0)
+; RV32I-NEXT:    lbu s9, 14(a0)
+; RV32I-NEXT:    lbu s10, 15(a0)
+; RV32I-NEXT:    lbu s11, 16(a0)
+; RV32I-NEXT:    lbu ra, 17(a0)
+; RV32I-NEXT:    lbu t4, 18(a0)
+; RV32I-NEXT:    lbu s0, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 21(a0)
-; RV32I-NEXT:    lbu ra, 22(a0)
-; RV32I-NEXT:    lbu a3, 23(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 21(a0)
+; RV32I-NEXT:    lbu t5, 22(a0)
+; RV32I-NEXT:    lbu s1, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    lbu s1, 24(a0)
-; RV32I-NEXT:    lbu s3, 25(a0)
-; RV32I-NEXT:    lbu t4, 26(a0)
-; RV32I-NEXT:    lbu t5, 27(a0)
-; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s3, s3, 8
 ; RV32I-NEXT:    slli s4, s4, 16
 ; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    or t1, s2, s0
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    or t3, s7, s6
-; RV32I-NEXT:    lbu t6, 28(a0)
-; RV32I-NEXT:    lbu s4, 29(a0)
-; RV32I-NEXT:    lbu s5, 30(a0)
-; RV32I-NEXT:    lbu s6, 31(a0)
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, s9, s8
-; RV32I-NEXT:    or s0, s11, s10
-; RV32I-NEXT:    or s2, a3, ra
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu s7, 1(a1)
-; RV32I-NEXT:    lbu s8, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t6, t3
+; RV32I-NEXT:    or a7, s3, s2
+; RV32I-NEXT:    or t0, s5, s4
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu s5, 25(a0)
+; RV32I-NEXT:    lbu s6, 26(a0)
+; RV32I-NEXT:    lbu t6, 27(a0)
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    slli s9, s9, 16
+; RV32I-NEXT:    slli s10, s10, 24
+; RV32I-NEXT:    slli ra, ra, 8
+; RV32I-NEXT:    or s7, s8, s7
+; RV32I-NEXT:    or s2, s10, s9
+; RV32I-NEXT:    or s3, ra, s11
+; RV32I-NEXT:    lbu s4, 28(a0)
+; RV32I-NEXT:    lbu s8, 29(a0)
+; RV32I-NEXT:    lbu s9, 30(a0)
+; RV32I-NEXT:    lbu s10, 31(a0)
+; RV32I-NEXT:    lbu a0, 0(a1)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
 ; RV32I-NEXT:    sw zero, 64(sp)
@@ -1619,89 +1555,90 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw zero, 44(sp)
 ; RV32I-NEXT:    sw zero, 48(sp)
 ; RV32I-NEXT:    sw zero, 52(sp)
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    or s1, s3, s1
-; RV32I-NEXT:    addi s3, sp, 8
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s4, t6
-; RV32I-NEXT:    or t6, s6, s5
-; RV32I-NEXT:    or a3, s7, a3
-; RV32I-NEXT:    or a1, a1, s8
-; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, s4
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a7, t2, t1
-; RV32I-NEXT:    or t0, a0, t3
-; RV32I-NEXT:    or t1, s2, s0
-; RV32I-NEXT:    or t2, t4, s1
-; RV32I-NEXT:    or t3, t6, t5
-; RV32I-NEXT:    or a0, a1, a3
-; RV32I-NEXT:    sw t0, 24(sp)
-; RV32I-NEXT:    sw t1, 28(sp)
-; RV32I-NEXT:    sw t2, 32(sp)
-; RV32I-NEXT:    sw t3, 36(sp)
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
-; RV32I-NEXT:    sw a6, 16(sp)
-; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    or t4, s0, t4
+; RV32I-NEXT:    addi s0, sp, 8
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    slli s9, s9, 16
+; RV32I-NEXT:    slli s10, s10, 24
+; RV32I-NEXT:    or t1, t2, t1
 ; RV32I-NEXT:    srli a1, a0, 3
-; RV32I-NEXT:    andi a3, a0, 31
-; RV32I-NEXT:    andi a4, a1, 28
-; RV32I-NEXT:    xori a1, a3, 31
-; RV32I-NEXT:    add a4, s3, a4
-; RV32I-NEXT:    lw a3, 0(a4)
-; RV32I-NEXT:    lw a5, 4(a4)
-; RV32I-NEXT:    lw a6, 8(a4)
-; RV32I-NEXT:    lw a7, 12(a4)
-; RV32I-NEXT:    lw t0, 16(a4)
-; RV32I-NEXT:    lw t1, 20(a4)
-; RV32I-NEXT:    lw t2, 24(a4)
-; RV32I-NEXT:    lw a4, 28(a4)
-; RV32I-NEXT:    srl t3, a5, a0
-; RV32I-NEXT:    slli t4, a6, 1
+; RV32I-NEXT:    or t2, s1, t5
+; RV32I-NEXT:    andi t5, a0, 31
+; RV32I-NEXT:    or t3, s5, t3
+; RV32I-NEXT:    or t6, t6, s6
+; RV32I-NEXT:    or s1, s8, s4
+; RV32I-NEXT:    or s4, s10, s9
+; RV32I-NEXT:    andi s5, a1, 28
+; RV32I-NEXT:    xori a1, t5, 31
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, s2, s7
+; RV32I-NEXT:    or a7, t4, s3
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or t1, t6, t3
+; RV32I-NEXT:    or t2, s4, s1
+; RV32I-NEXT:    add s0, s0, s5
+; RV32I-NEXT:    sw a7, 24(sp)
+; RV32I-NEXT:    sw t0, 28(sp)
+; RV32I-NEXT:    sw t1, 32(sp)
+; RV32I-NEXT:    sw t2, 36(sp)
+; RV32I-NEXT:    sw a3, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a5, 16(sp)
+; RV32I-NEXT:    sw a6, 20(sp)
+; RV32I-NEXT:    lw a3, 0(s0)
+; RV32I-NEXT:    lw a4, 4(s0)
+; RV32I-NEXT:    lw a5, 8(s0)
+; RV32I-NEXT:    lw a6, 12(s0)
+; RV32I-NEXT:    lw a7, 16(s0)
+; RV32I-NEXT:    lw t0, 20(s0)
+; RV32I-NEXT:    lw t1, 24(s0)
+; RV32I-NEXT:    lw t2, 28(s0)
+; RV32I-NEXT:    srl t3, a4, a0
+; RV32I-NEXT:    slli t4, a5, 1
 ; RV32I-NEXT:    srl a3, a3, a0
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    srl t5, a7, a0
-; RV32I-NEXT:    slli t6, t0, 1
-; RV32I-NEXT:    srl a6, a6, a0
-; RV32I-NEXT:    slli a7, a7, 1
-; RV32I-NEXT:    srl s0, t1, a0
-; RV32I-NEXT:    slli s1, t2, 1
-; RV32I-NEXT:    srl t0, t0, a0
-; RV32I-NEXT:    slli t1, t1, 1
+; RV32I-NEXT:    slli a4, a4, 1
+; RV32I-NEXT:    srl t5, a6, a0
+; RV32I-NEXT:    slli t6, a7, 1
+; RV32I-NEXT:    srl a5, a5, a0
+; RV32I-NEXT:    slli a6, a6, 1
+; RV32I-NEXT:    srl s0, t0, a0
+; RV32I-NEXT:    slli s1, t1, 1
+; RV32I-NEXT:    srl a7, a7, a0
+; RV32I-NEXT:    slli t0, t0, 1
+; RV32I-NEXT:    srl t1, t1, a0
+; RV32I-NEXT:    slli s2, t2, 1
 ; RV32I-NEXT:    srl t2, t2, a0
-; RV32I-NEXT:    slli s2, a4, 1
-; RV32I-NEXT:    srl s3, a4, a0
 ; RV32I-NEXT:    sll a0, t4, a1
-; RV32I-NEXT:    sll a4, a5, a1
-; RV32I-NEXT:    sll a5, t6, a1
-; RV32I-NEXT:    sll a7, a7, a1
-; RV32I-NEXT:    sll t4, s1, a1
-; RV32I-NEXT:    sll t1, t1, a1
-; RV32I-NEXT:    sll t6, s2, a1
-; RV32I-NEXT:    srli s1, s3, 24
-; RV32I-NEXT:    srli s2, s3, 16
-; RV32I-NEXT:    srli s4, s3, 8
+; RV32I-NEXT:    sll a4, a4, a1
+; RV32I-NEXT:    sll t4, t6, a1
+; RV32I-NEXT:    sll a6, a6, a1
+; RV32I-NEXT:    sll t6, s1, a1
+; RV32I-NEXT:    sll t0, t0, a1
+; RV32I-NEXT:    sll s1, s2, a1
+; RV32I-NEXT:    srli s2, t2, 24
+; RV32I-NEXT:    srli s3, t2, 16
+; RV32I-NEXT:    srli s4, t2, 8
 ; RV32I-NEXT:    or a0, t3, a0
 ; RV32I-NEXT:    or a1, a3, a4
-; RV32I-NEXT:    or a3, t5, a5
-; RV32I-NEXT:    or a4, a6, a7
-; RV32I-NEXT:    or a5, s0, t4
-; RV32I-NEXT:    or a6, t0, t1
-; RV32I-NEXT:    or a7, t2, t6
-; RV32I-NEXT:    sb s3, 28(a2)
+; RV32I-NEXT:    or a3, t5, t4
+; RV32I-NEXT:    or a4, a5, a6
+; RV32I-NEXT:    or a5, s0, t6
+; RV32I-NEXT:    or a6, a7, t0
+; RV32I-NEXT:    or a7, t1, s1
+; RV32I-NEXT:    sb t2, 28(a2)
 ; RV32I-NEXT:    sb s4, 29(a2)
-; RV32I-NEXT:    sb s2, 30(a2)
-; RV32I-NEXT:    sb s1, 31(a2)
+; RV32I-NEXT:    sb s3, 30(a2)
+; RV32I-NEXT:    sb s2, 31(a2)
 ; RV32I-NEXT:    srli t0, a7, 24
 ; RV32I-NEXT:    srli t1, a7, 16
 ; RV32I-NEXT:    srli t2, a7, 8
@@ -1775,19 +1712,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -160
-; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -144
+; RV64I-NEXT:    sd s0, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -1804,146 +1739,125 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s1, 13(a0)
 ; RV64I-NEXT:    lbu s2, 14(a0)
 ; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    lbu s4, 16(a0)
 ; RV64I-NEXT:    lbu s5, 17(a0)
 ; RV64I-NEXT:    lbu s6, 18(a0)
 ; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    slli s8, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a5, a4, a3
-; RV64I-NEXT:    or a6, a6, s8
-; RV64I-NEXT:    or a3, t0, a7
-; RV64I-NEXT:    or a4, t2, t1
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t5, t5, 16
 ; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    lbu t5, 20(a0)
+; RV64I-NEXT:    lbu t6, 21(a0)
+; RV64I-NEXT:    lbu s8, 22(a0)
+; RV64I-NEXT:    lbu s9, 23(a0)
 ; RV64I-NEXT:    slli s1, s1, 8
 ; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    or t1, s1, s0
-; RV64I-NEXT:    or t2, s3, s2
-; RV64I-NEXT:    lbu t6, 24(a0)
-; RV64I-NEXT:    lbu s0, 25(a0)
-; RV64I-NEXT:    lbu s1, 26(a0)
-; RV64I-NEXT:    lbu s2, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s6, s6, 16
 ; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
 ; RV64I-NEXT:    or t3, s5, s4
 ; RV64I-NEXT:    or t4, s7, s6
-; RV64I-NEXT:    or t5, s9, s8
-; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s0, 24(a0)
+; RV64I-NEXT:    lbu s1, 25(a0)
+; RV64I-NEXT:    lbu s2, 26(a0)
+; RV64I-NEXT:    lbu s3, 27(a0)
+; RV64I-NEXT:    slli t6, t6, 8
+; RV64I-NEXT:    slli s8, s8, 16
+; RV64I-NEXT:    slli s9, s9, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    or t5, t6, t5
+; RV64I-NEXT:    or t6, s9, s8
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    lbu s1, 28(a0)
 ; RV64I-NEXT:    lbu s4, 29(a0)
 ; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu s6, 31(a0)
-; RV64I-NEXT:    slli s10, s10, 16
-; RV64I-NEXT:    slli s11, s11, 24
-; RV64I-NEXT:    slli s0, s0, 8
-; RV64I-NEXT:    slli s1, s1, 16
-; RV64I-NEXT:    slli s2, s2, 24
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    or a0, s11, s10
-; RV64I-NEXT:    or t6, s0, t6
-; RV64I-NEXT:    or s0, s2, s1
-; RV64I-NEXT:    or s1, s4, s3
-; RV64I-NEXT:    lbu s2, 0(a1)
-; RV64I-NEXT:    lbu s3, 1(a1)
-; RV64I-NEXT:    lbu s4, 2(a1)
-; RV64I-NEXT:    lbu s7, 3(a1)
-; RV64I-NEXT:    slli s5, s5, 16
-; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    slli s3, s3, 8
-; RV64I-NEXT:    slli s4, s4, 16
-; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    or s5, s6, s5
-; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    or s3, s7, s4
-; RV64I-NEXT:    lbu s4, 5(a1)
-; RV64I-NEXT:    lbu s6, 4(a1)
-; RV64I-NEXT:    lbu s7, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    or s4, s4, s6
-; RV64I-NEXT:    slli s7, s7, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, s7
+; RV64I-NEXT:    lbu a0, 0(a1)
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    addi a6, sp, 32
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a1, s3, s2
+; RV64I-NEXT:    addi s2, sp, 32
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    or s1, s4, s1
+; RV64I-NEXT:    srli s3, a0, 3
+; RV64I-NEXT:    or s4, s6, s5
+; RV64I-NEXT:    andi s5, a0, 63
+; RV64I-NEXT:    andi s3, s3, 24
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, t0, a7
-; RV64I-NEXT:    or a7, t2, t1
-; RV64I-NEXT:    or t0, t4, t3
-; RV64I-NEXT:    or a0, a0, t5
-; RV64I-NEXT:    or t1, s0, t6
-; RV64I-NEXT:    or t2, s5, s1
-; RV64I-NEXT:    or t3, s3, s2
-; RV64I-NEXT:    or a1, a1, s4
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    slli a7, a7, 32
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    slli t2, t2, 32
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a3, a3, a5
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    or a5, t2, t1
-; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    or t1, s4, s1
+; RV64I-NEXT:    sub t2, s2, s3
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli t0, t0, 32
+; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a1, t1, a1
 ; RV64I-NEXT:    sd a3, 32(sp)
 ; RV64I-NEXT:    sd a4, 40(sp)
-; RV64I-NEXT:    sd a0, 48(sp)
-; RV64I-NEXT:    sd a5, 56(sp)
-; RV64I-NEXT:    srli a0, a1, 3
-; RV64I-NEXT:    andi a3, a1, 63
-; RV64I-NEXT:    andi a0, a0, 24
-; RV64I-NEXT:    sub a0, a6, a0
-; RV64I-NEXT:    ld a4, 0(a0)
-; RV64I-NEXT:    ld a5, 8(a0)
-; RV64I-NEXT:    ld a6, 16(a0)
-; RV64I-NEXT:    ld a0, 24(a0)
-; RV64I-NEXT:    xori a3, a3, 63
-; RV64I-NEXT:    sll a7, a5, a1
-; RV64I-NEXT:    srli t0, a4, 1
-; RV64I-NEXT:    sll t1, a0, a1
-; RV64I-NEXT:    srli a0, a6, 1
-; RV64I-NEXT:    sll a6, a6, a1
-; RV64I-NEXT:    srli a5, a5, 1
-; RV64I-NEXT:    sll a4, a4, a1
-; RV64I-NEXT:    srl a1, t0, a3
-; RV64I-NEXT:    srl t0, a0, a3
-; RV64I-NEXT:    srl a3, a5, a3
-; RV64I-NEXT:    srli a5, a4, 56
-; RV64I-NEXT:    srli t2, a4, 48
-; RV64I-NEXT:    srli t3, a4, 40
-; RV64I-NEXT:    srli t4, a4, 32
-; RV64I-NEXT:    srli t5, a4, 24
-; RV64I-NEXT:    srli t6, a4, 16
-; RV64I-NEXT:    srli s0, a4, 8
-; RV64I-NEXT:    or a0, a7, a1
-; RV64I-NEXT:    or a1, t1, t0
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    sb t4, 4(a2)
-; RV64I-NEXT:    sb t3, 5(a2)
-; RV64I-NEXT:    sb t2, 6(a2)
-; RV64I-NEXT:    sb a5, 7(a2)
-; RV64I-NEXT:    sb a4, 0(a2)
-; RV64I-NEXT:    sb s0, 1(a2)
-; RV64I-NEXT:    sb t6, 2(a2)
-; RV64I-NEXT:    sb t5, 3(a2)
+; RV64I-NEXT:    sd a5, 48(sp)
+; RV64I-NEXT:    sd a1, 56(sp)
+; RV64I-NEXT:    ld a1, 0(t2)
+; RV64I-NEXT:    ld a3, 8(t2)
+; RV64I-NEXT:    ld a4, 16(t2)
+; RV64I-NEXT:    ld a5, 24(t2)
+; RV64I-NEXT:    xori a6, s5, 63
+; RV64I-NEXT:    sll a7, a3, a0
+; RV64I-NEXT:    srli t0, a1, 1
+; RV64I-NEXT:    sll a5, a5, a0
+; RV64I-NEXT:    srli t1, a4, 1
+; RV64I-NEXT:    sll a4, a4, a0
+; RV64I-NEXT:    srli a3, a3, 1
+; RV64I-NEXT:    sll t2, a1, a0
+; RV64I-NEXT:    srl a0, t0, a6
+; RV64I-NEXT:    srl a1, t1, a6
+; RV64I-NEXT:    srl a3, a3, a6
+; RV64I-NEXT:    srli a6, t2, 56
+; RV64I-NEXT:    srli t0, t2, 48
+; RV64I-NEXT:    srli t1, t2, 40
+; RV64I-NEXT:    srli t3, t2, 32
+; RV64I-NEXT:    srli t4, t2, 24
+; RV64I-NEXT:    srli t5, t2, 16
+; RV64I-NEXT:    srli t6, t2, 8
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    or a1, a5, a1
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    sb t3, 4(a2)
+; RV64I-NEXT:    sb t1, 5(a2)
+; RV64I-NEXT:    sb t0, 6(a2)
+; RV64I-NEXT:    sb a6, 7(a2)
+; RV64I-NEXT:    sb t2, 0(a2)
+; RV64I-NEXT:    sb t6, 1(a2)
+; RV64I-NEXT:    sb t5, 2(a2)
+; RV64I-NEXT:    sb t4, 3(a2)
 ; RV64I-NEXT:    srli a4, a3, 56
 ; RV64I-NEXT:    srli a5, a3, 48
 ; RV64I-NEXT:    srli a6, a3, 40
@@ -1989,19 +1903,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a1, 9(a2)
 ; RV64I-NEXT:    sb a5, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 160
+; RV64I-NEXT:    ld s0, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 144
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes:
@@ -2026,67 +1938,55 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a7, 3(a0)
 ; RV32I-NEXT:    lbu a5, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s2, 13(a0)
-; RV32I-NEXT:    lbu s4, 14(a0)
-; RV32I-NEXT:    lbu s5, 15(a0)
-; RV32I-NEXT:    lbu s6, 16(a0)
-; RV32I-NEXT:    lbu s7, 17(a0)
-; RV32I-NEXT:    lbu s8, 18(a0)
-; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t6, 7(a0)
+; RV32I-NEXT:    lbu s2, 8(a0)
+; RV32I-NEXT:    lbu s3, 9(a0)
+; RV32I-NEXT:    lbu s4, 10(a0)
+; RV32I-NEXT:    lbu s5, 11(a0)
+; RV32I-NEXT:    lbu s7, 12(a0)
+; RV32I-NEXT:    lbu s8, 13(a0)
+; RV32I-NEXT:    lbu s9, 14(a0)
+; RV32I-NEXT:    lbu s10, 15(a0)
+; RV32I-NEXT:    lbu s11, 16(a0)
+; RV32I-NEXT:    lbu ra, 17(a0)
+; RV32I-NEXT:    lbu t4, 18(a0)
+; RV32I-NEXT:    lbu s0, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu s10, 20(a0)
-; RV32I-NEXT:    lbu s11, 21(a0)
-; RV32I-NEXT:    lbu ra, 22(a0)
-; RV32I-NEXT:    lbu a3, 23(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 21(a0)
+; RV32I-NEXT:    lbu t5, 22(a0)
+; RV32I-NEXT:    lbu s1, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    lbu s1, 24(a0)
-; RV32I-NEXT:    lbu s3, 25(a0)
-; RV32I-NEXT:    lbu t4, 26(a0)
-; RV32I-NEXT:    lbu t5, 27(a0)
-; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s3, s3, 8
 ; RV32I-NEXT:    slli s4, s4, 16
 ; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    or t1, s2, s0
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    or t3, s7, s6
-; RV32I-NEXT:    lbu t6, 28(a0)
-; RV32I-NEXT:    lbu s4, 29(a0)
-; RV32I-NEXT:    lbu s5, 30(a0)
-; RV32I-NEXT:    lbu s6, 31(a0)
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or a0, s9, s8
-; RV32I-NEXT:    or s0, s11, s10
-; RV32I-NEXT:    or s2, a3, ra
-; RV32I-NEXT:    lbu a3, 0(a1)
-; RV32I-NEXT:    lbu s7, 1(a1)
-; RV32I-NEXT:    lbu s8, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t6, t3
+; RV32I-NEXT:    or a7, s3, s2
+; RV32I-NEXT:    or t0, s5, s4
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu s5, 25(a0)
+; RV32I-NEXT:    lbu s6, 26(a0)
+; RV32I-NEXT:    lbu t6, 27(a0)
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    slli s9, s9, 16
+; RV32I-NEXT:    slli s10, s10, 24
+; RV32I-NEXT:    slli ra, ra, 8
+; RV32I-NEXT:    or s7, s8, s7
+; RV32I-NEXT:    or s2, s10, s9
+; RV32I-NEXT:    or s3, ra, s11
+; RV32I-NEXT:    lbu s4, 28(a0)
+; RV32I-NEXT:    lbu s8, 29(a0)
+; RV32I-NEXT:    lbu s9, 30(a0)
+; RV32I-NEXT:    lbu s10, 31(a0)
+; RV32I-NEXT:    lbu a0, 0(a1)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
 ; RV32I-NEXT:    sw zero, 32(sp)
@@ -2095,88 +1995,89 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw zero, 12(sp)
 ; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
-; RV32I-NEXT:    slli s3, s3, 8
-; RV32I-NEXT:    or s1, s3, s1
-; RV32I-NEXT:    addi s3, sp, 40
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli s6, s6, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s4, t6
-; RV32I-NEXT:    or t6, s6, s5
-; RV32I-NEXT:    or a3, s7, a3
-; RV32I-NEXT:    or a1, a1, s8
-; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, s4
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a7, t2, t1
-; RV32I-NEXT:    or t0, a0, t3
-; RV32I-NEXT:    or t1, s2, s0
-; RV32I-NEXT:    or t2, t4, s1
-; RV32I-NEXT:    or t3, t6, t5
-; RV32I-NEXT:    or a0, a1, a3
-; RV32I-NEXT:    sw t0, 56(sp)
-; RV32I-NEXT:    sw t1, 60(sp)
-; RV32I-NEXT:    sw t2, 64(sp)
-; RV32I-NEXT:    sw t3, 68(sp)
-; RV32I-NEXT:    sw a4, 40(sp)
-; RV32I-NEXT:    sw a5, 44(sp)
-; RV32I-NEXT:    sw a6, 48(sp)
-; RV32I-NEXT:    sw a7, 52(sp)
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    or t4, s0, t4
+; RV32I-NEXT:    addi s0, sp, 40
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    slli s9, s9, 16
+; RV32I-NEXT:    slli s10, s10, 24
+; RV32I-NEXT:    or t1, t2, t1
 ; RV32I-NEXT:    srli a1, a0, 3
-; RV32I-NEXT:    andi a3, a0, 31
-; RV32I-NEXT:    andi a4, a1, 28
-; RV32I-NEXT:    xori a1, a3, 31
-; RV32I-NEXT:    sub a3, s3, a4
-; RV32I-NEXT:    lw a4, 0(a3)
-; RV32I-NEXT:    lw a5, 4(a3)
-; RV32I-NEXT:    lw a6, 8(a3)
-; RV32I-NEXT:    lw a7, 12(a3)
-; RV32I-NEXT:    lw t0, 16(a3)
-; RV32I-NEXT:    lw t1, 20(a3)
-; RV32I-NEXT:    lw t2, 24(a3)
-; RV32I-NEXT:    lw a3, 28(a3)
-; RV32I-NEXT:    sll t3, a5, a0
-; RV32I-NEXT:    srli t4, a4, 1
-; RV32I-NEXT:    sll t5, a7, a0
-; RV32I-NEXT:    srli t6, a6, 1
-; RV32I-NEXT:    sll a6, a6, a0
-; RV32I-NEXT:    srli a5, a5, 1
-; RV32I-NEXT:    sll s0, t1, a0
-; RV32I-NEXT:    srli s1, t0, 1
-; RV32I-NEXT:    sll t0, t0, a0
-; RV32I-NEXT:    srli a7, a7, 1
-; RV32I-NEXT:    sll s2, a3, a0
-; RV32I-NEXT:    srli a3, t2, 1
+; RV32I-NEXT:    or t2, s1, t5
+; RV32I-NEXT:    andi t5, a0, 31
+; RV32I-NEXT:    or t3, s5, t3
+; RV32I-NEXT:    or t6, t6, s6
+; RV32I-NEXT:    or s1, s8, s4
+; RV32I-NEXT:    or s4, s10, s9
+; RV32I-NEXT:    andi s5, a1, 28
+; RV32I-NEXT:    xori a1, t5, 31
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, s2, s7
+; RV32I-NEXT:    or a7, t4, s3
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or t1, t6, t3
+; RV32I-NEXT:    or t2, s4, s1
+; RV32I-NEXT:    sub t3, s0, s5
+; RV32I-NEXT:    sw a7, 56(sp)
+; RV32I-NEXT:    sw t0, 60(sp)
+; RV32I-NEXT:    sw t1, 64(sp)
+; RV32I-NEXT:    sw t2, 68(sp)
+; RV32I-NEXT:    sw a3, 40(sp)
+; RV32I-NEXT:    sw a4, 44(sp)
+; RV32I-NEXT:    sw a5, 48(sp)
+; RV32I-NEXT:    sw a6, 52(sp)
+; RV32I-NEXT:    lw a3, 0(t3)
+; RV32I-NEXT:    lw a4, 4(t3)
+; RV32I-NEXT:    lw a5, 8(t3)
+; RV32I-NEXT:    lw a6, 12(t3)
+; RV32I-NEXT:    lw a7, 16(t3)
+; RV32I-NEXT:    lw t0, 20(t3)
+; RV32I-NEXT:    lw t1, 24(t3)
+; RV32I-NEXT:    lw t2, 28(t3)
+; RV32I-NEXT:    sll t3, a4, a0
+; RV32I-NEXT:    srli t4, a3, 1
+; RV32I-NEXT:    sll t5, a6, a0
+; RV32I-NEXT:    srli t6, a5, 1
+; RV32I-NEXT:    sll a5, a5, a0
+; RV32I-NEXT:    srli a4, a4, 1
+; RV32I-NEXT:    sll s0, t0, a0
+; RV32I-NEXT:    srli s1, a7, 1
+; RV32I-NEXT:    sll a7, a7, a0
+; RV32I-NEXT:    srli a6, a6, 1
 ; RV32I-NEXT:    sll t2, t2, a0
-; RV32I-NEXT:    srli t1, t1, 1
-; RV32I-NEXT:    sll s3, a4, a0
+; RV32I-NEXT:    srli s2, t1, 1
+; RV32I-NEXT:    sll t1, t1, a0
+; RV32I-NEXT:    srli t0, t0, 1
+; RV32I-NEXT:    sll s3, a3, a0
 ; RV32I-NEXT:    srl a0, t4, a1
-; RV32I-NEXT:    srl a4, t6, a1
-; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    srl a3, t6, a1
+; RV32I-NEXT:    srl a4, a4, a1
 ; RV32I-NEXT:    srl t4, s1, a1
-; RV32I-NEXT:    srl a7, a7, a1
-; RV32I-NEXT:    srl t6, a3, a1
-; RV32I-NEXT:    srl t1, t1, a1
+; RV32I-NEXT:    srl a6, a6, a1
+; RV32I-NEXT:    srl t6, s2, a1
+; RV32I-NEXT:    srl t0, t0, a1
 ; RV32I-NEXT:    srli s1, s3, 24
-; RV32I-NEXT:    srli s4, s3, 16
-; RV32I-NEXT:    srli s5, s3, 8
+; RV32I-NEXT:    srli s2, s3, 16
+; RV32I-NEXT:    srli s4, s3, 8
 ; RV32I-NEXT:    or a0, t3, a0
-; RV32I-NEXT:    or a1, t5, a4
-; RV32I-NEXT:    or a3, a6, a5
+; RV32I-NEXT:    or a1, t5, a3
+; RV32I-NEXT:    or a3, a5, a4
 ; RV32I-NEXT:    or a4, s0, t4
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, s2, t6
-; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a6, t2, t6
+; RV32I-NEXT:    or a7, t1, t0
 ; RV32I-NEXT:    sb s3, 0(a2)
-; RV32I-NEXT:    sb s5, 1(a2)
-; RV32I-NEXT:    sb s4, 2(a2)
+; RV32I-NEXT:    sb s4, 1(a2)
+; RV32I-NEXT:    sb s2, 2(a2)
 ; RV32I-NEXT:    sb s1, 3(a2)
 ; RV32I-NEXT:    srli t0, a7, 24
 ; RV32I-NEXT:    srli t1, a7, 16
@@ -2251,19 +2152,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -160
-; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -144
+; RV64I-NEXT:    sd s0, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -2280,144 +2179,123 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s1, 13(a0)
 ; RV64I-NEXT:    lbu s2, 14(a0)
 ; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    lbu s4, 16(a0)
 ; RV64I-NEXT:    lbu s5, 17(a0)
 ; RV64I-NEXT:    lbu s6, 18(a0)
 ; RV64I-NEXT:    lbu s7, 19(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    lbu s8, 20(a0)
-; RV64I-NEXT:    lbu s9, 21(a0)
-; RV64I-NEXT:    lbu s10, 22(a0)
-; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t5, t5, 16
 ; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    lbu t5, 20(a0)
+; RV64I-NEXT:    lbu t6, 21(a0)
+; RV64I-NEXT:    lbu s8, 22(a0)
+; RV64I-NEXT:    lbu s9, 23(a0)
 ; RV64I-NEXT:    slli s1, s1, 8
 ; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    or t1, s1, s0
-; RV64I-NEXT:    or t2, s3, s2
-; RV64I-NEXT:    lbu t6, 24(a0)
-; RV64I-NEXT:    lbu s0, 25(a0)
-; RV64I-NEXT:    lbu s1, 26(a0)
-; RV64I-NEXT:    lbu s2, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s6, s6, 16
 ; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
 ; RV64I-NEXT:    or t3, s5, s4
 ; RV64I-NEXT:    or t4, s7, s6
-; RV64I-NEXT:    or t5, s9, s8
-; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s0, 24(a0)
+; RV64I-NEXT:    lbu s1, 25(a0)
+; RV64I-NEXT:    lbu s2, 26(a0)
+; RV64I-NEXT:    lbu s3, 27(a0)
+; RV64I-NEXT:    slli t6, t6, 8
+; RV64I-NEXT:    slli s8, s8, 16
+; RV64I-NEXT:    slli s9, s9, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    or t5, t6, t5
+; RV64I-NEXT:    or t6, s9, s8
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    lbu s1, 28(a0)
 ; RV64I-NEXT:    lbu s4, 29(a0)
 ; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu s6, 31(a0)
-; RV64I-NEXT:    slli s10, s10, 16
-; RV64I-NEXT:    slli s11, s11, 24
-; RV64I-NEXT:    slli s0, s0, 8
-; RV64I-NEXT:    slli s1, s1, 16
-; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    lbu a0, 0(a1)
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a1, s3, s2
+; RV64I-NEXT:    mv s2, sp
 ; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    or a0, s11, s10
-; RV64I-NEXT:    or t6, s0, t6
-; RV64I-NEXT:    or s0, s2, s1
-; RV64I-NEXT:    or s1, s4, s3
-; RV64I-NEXT:    lbu s2, 0(a1)
-; RV64I-NEXT:    lbu s3, 1(a1)
-; RV64I-NEXT:    lbu s4, 2(a1)
-; RV64I-NEXT:    lbu s7, 3(a1)
 ; RV64I-NEXT:    slli s5, s5, 16
 ; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    slli s3, s3, 8
-; RV64I-NEXT:    slli s4, s4, 16
-; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    or s5, s6, s5
-; RV64I-NEXT:    or s2, s3, s2
-; RV64I-NEXT:    or s3, s7, s4
-; RV64I-NEXT:    lbu s4, 5(a1)
-; RV64I-NEXT:    lbu s6, 4(a1)
-; RV64I-NEXT:    lbu s7, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    or s4, s4, s6
-; RV64I-NEXT:    slli s7, s7, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, s7
-; RV64I-NEXT:    mv s6, sp
+; RV64I-NEXT:    or s1, s4, s1
+; RV64I-NEXT:    srli s3, a0, 3
+; RV64I-NEXT:    or s4, s6, s5
+; RV64I-NEXT:    andi s5, a0, 63
+; RV64I-NEXT:    andi s3, s3, 24
+; RV64I-NEXT:    xori s5, s5, 63
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a5, t0, a7
 ; RV64I-NEXT:    or a6, t2, t1
 ; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or a0, a0, t5
-; RV64I-NEXT:    or t0, s0, t6
-; RV64I-NEXT:    or t1, s5, s1
-; RV64I-NEXT:    or t2, s3, s2
-; RV64I-NEXT:    or a1, a1, s4
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or a1, a1, s0
+; RV64I-NEXT:    or t1, s4, s1
+; RV64I-NEXT:    add s2, s2, s3
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    slli t3, t1, 32
-; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    slli t0, t0, 32
+; RV64I-NEXT:    slli t2, t1, 32
 ; RV64I-NEXT:    sraiw t1, t1, 31
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a5, t3, t0
-; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a1, t2, a1
 ; RV64I-NEXT:    sd t1, 32(sp)
 ; RV64I-NEXT:    sd t1, 40(sp)
 ; RV64I-NEXT:    sd t1, 48(sp)
 ; RV64I-NEXT:    sd t1, 56(sp)
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a0, 16(sp)
-; RV64I-NEXT:    sd a5, 24(sp)
-; RV64I-NEXT:    srli a0, a1, 3
-; RV64I-NEXT:    andi a3, a1, 63
-; RV64I-NEXT:    andi a0, a0, 24
-; RV64I-NEXT:    xori a3, a3, 63
-; RV64I-NEXT:    add a0, s6, a0
-; RV64I-NEXT:    ld a4, 8(a0)
-; RV64I-NEXT:    ld a5, 16(a0)
-; RV64I-NEXT:    ld a6, 0(a0)
-; RV64I-NEXT:    ld a0, 24(a0)
-; RV64I-NEXT:    srl a7, a4, a1
+; RV64I-NEXT:    sd a5, 16(sp)
+; RV64I-NEXT:    sd a1, 24(sp)
+; RV64I-NEXT:    ld a1, 8(s2)
+; RV64I-NEXT:    ld a3, 16(s2)
+; RV64I-NEXT:    ld a4, 0(s2)
+; RV64I-NEXT:    ld a5, 24(s2)
+; RV64I-NEXT:    srl a6, a1, a0
+; RV64I-NEXT:    slli a7, a3, 1
+; RV64I-NEXT:    srl a4, a4, a0
+; RV64I-NEXT:    slli a1, a1, 1
+; RV64I-NEXT:    srl a3, a3, a0
 ; RV64I-NEXT:    slli t0, a5, 1
-; RV64I-NEXT:    srl a6, a6, a1
-; RV64I-NEXT:    slli a4, a4, 1
-; RV64I-NEXT:    srl a5, a5, a1
-; RV64I-NEXT:    slli t1, a0, 1
-; RV64I-NEXT:    sra t2, a0, a1
-; RV64I-NEXT:    sll a0, t0, a3
-; RV64I-NEXT:    sll a1, a4, a3
-; RV64I-NEXT:    sll a3, t1, a3
-; RV64I-NEXT:    srli a4, t2, 56
-; RV64I-NEXT:    srli t0, t2, 48
-; RV64I-NEXT:    srli t1, t2, 40
-; RV64I-NEXT:    srli t3, t2, 32
-; RV64I-NEXT:    srli t4, t2, 24
-; RV64I-NEXT:    srli t5, t2, 16
-; RV64I-NEXT:    srli t6, t2, 8
-; RV64I-NEXT:    or a0, a7, a0
-; RV64I-NEXT:    or a1, a6, a1
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    sra a5, a5, a0
+; RV64I-NEXT:    sll a0, a7, s5
+; RV64I-NEXT:    sll a1, a1, s5
+; RV64I-NEXT:    sll a7, t0, s5
+; RV64I-NEXT:    srli t0, a5, 56
+; RV64I-NEXT:    srli t1, a5, 48
+; RV64I-NEXT:    srli t2, a5, 40
+; RV64I-NEXT:    srli t3, a5, 32
+; RV64I-NEXT:    srli t4, a5, 24
+; RV64I-NEXT:    srli t5, a5, 16
+; RV64I-NEXT:    srli t6, a5, 8
+; RV64I-NEXT:    or a0, a6, a0
+; RV64I-NEXT:    or a1, a4, a1
+; RV64I-NEXT:    or a3, a3, a7
 ; RV64I-NEXT:    sb t3, 28(a2)
-; RV64I-NEXT:    sb t1, 29(a2)
-; RV64I-NEXT:    sb t0, 30(a2)
-; RV64I-NEXT:    sb a4, 31(a2)
-; RV64I-NEXT:    sb t2, 24(a2)
+; RV64I-NEXT:    sb t2, 29(a2)
+; RV64I-NEXT:    sb t1, 30(a2)
+; RV64I-NEXT:    sb t0, 31(a2)
+; RV64I-NEXT:    sb a5, 24(a2)
 ; RV64I-NEXT:    sb t6, 25(a2)
 ; RV64I-NEXT:    sb t5, 26(a2)
 ; RV64I-NEXT:    sb t4, 27(a2)
@@ -2438,47 +2316,45 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    srli s3, a0, 56
 ; RV64I-NEXT:    srli s4, a0, 48
 ; RV64I-NEXT:    srli s5, a0, 40
-; RV64I-NEXT:    srli s6, a0, 32
 ; RV64I-NEXT:    sb a7, 20(a2)
 ; RV64I-NEXT:    sb a6, 21(a2)
 ; RV64I-NEXT:    sb a5, 22(a2)
 ; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a0, 24
+; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    sb a3, 16(a2)
 ; RV64I-NEXT:    sb t2, 17(a2)
 ; RV64I-NEXT:    sb t1, 18(a2)
 ; RV64I-NEXT:    sb t0, 19(a2)
-; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    srli a3, a0, 24
 ; RV64I-NEXT:    sb t6, 4(a2)
 ; RV64I-NEXT:    sb t5, 5(a2)
 ; RV64I-NEXT:    sb t4, 6(a2)
 ; RV64I-NEXT:    sb t3, 7(a2)
-; RV64I-NEXT:    srli a5, a0, 8
+; RV64I-NEXT:    srli a5, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
 ; RV64I-NEXT:    sb s2, 1(a2)
 ; RV64I-NEXT:    sb s1, 2(a2)
 ; RV64I-NEXT:    sb s0, 3(a2)
-; RV64I-NEXT:    sb s6, 12(a2)
+; RV64I-NEXT:    srli a1, a0, 8
+; RV64I-NEXT:    sb a4, 12(a2)
 ; RV64I-NEXT:    sb s5, 13(a2)
 ; RV64I-NEXT:    sb s4, 14(a2)
 ; RV64I-NEXT:    sb s3, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a5, 9(a2)
-; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    sb a4, 11(a2)
-; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 160
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a5, 10(a2)
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    ld s0, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 144
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_32bytes:
@@ -2503,159 +2379,148 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a7, 3(a0)
 ; RV32I-NEXT:    lbu a5, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t1, 6(a0)
-; RV32I-NEXT:    lbu t2, 7(a0)
-; RV32I-NEXT:    lbu t3, 8(a0)
-; RV32I-NEXT:    lbu t4, 9(a0)
-; RV32I-NEXT:    lbu t5, 10(a0)
-; RV32I-NEXT:    lbu t6, 11(a0)
-; RV32I-NEXT:    lbu s0, 12(a0)
-; RV32I-NEXT:    lbu s1, 13(a0)
-; RV32I-NEXT:    lbu s2, 14(a0)
-; RV32I-NEXT:    lbu s3, 15(a0)
-; RV32I-NEXT:    lbu s4, 16(a0)
-; RV32I-NEXT:    lbu s5, 17(a0)
-; RV32I-NEXT:    lbu s6, 18(a0)
-; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    lbu t3, 6(a0)
+; RV32I-NEXT:    lbu t4, 7(a0)
+; RV32I-NEXT:    lbu t6, 8(a0)
+; RV32I-NEXT:    lbu s0, 9(a0)
+; RV32I-NEXT:    lbu s4, 10(a0)
+; RV32I-NEXT:    lbu s5, 11(a0)
+; RV32I-NEXT:    lbu s6, 12(a0)
+; RV32I-NEXT:    lbu s7, 13(a0)
+; RV32I-NEXT:    lbu s8, 14(a0)
+; RV32I-NEXT:    lbu s9, 15(a0)
+; RV32I-NEXT:    lbu s10, 16(a0)
+; RV32I-NEXT:    lbu s11, 17(a0)
+; RV32I-NEXT:    lbu s2, 18(a0)
+; RV32I-NEXT:    lbu s3, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu s8, 20(a0)
-; RV32I-NEXT:    lbu s9, 21(a0)
-; RV32I-NEXT:    lbu s10, 22(a0)
-; RV32I-NEXT:    lbu s11, 23(a0)
+; RV32I-NEXT:    lbu t1, 20(a0)
+; RV32I-NEXT:    lbu t2, 21(a0)
+; RV32I-NEXT:    lbu t5, 22(a0)
+; RV32I-NEXT:    lbu s1, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s5, s5, 24
 ; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t2, t1
-; RV32I-NEXT:    or a7, t4, t3
-; RV32I-NEXT:    or t0, t6, t5
-; RV32I-NEXT:    lbu ra, 24(a0)
-; RV32I-NEXT:    lbu a3, 25(a0)
-; RV32I-NEXT:    lbu t4, 26(a0)
-; RV32I-NEXT:    lbu t5, 27(a0)
-; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or a6, t4, t3
+; RV32I-NEXT:    or a7, s0, t6
+; RV32I-NEXT:    or t0, s5, s4
+; RV32I-NEXT:    lbu t3, 24(a0)
+; RV32I-NEXT:    lbu s4, 25(a0)
+; RV32I-NEXT:    lbu s5, 26(a0)
+; RV32I-NEXT:    lbu ra, 27(a0)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    or t4, s7, s6
+; RV32I-NEXT:    or t6, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    lbu s6, 28(a0)
+; RV32I-NEXT:    lbu s7, 29(a0)
+; RV32I-NEXT:    lbu s8, 30(a0)
+; RV32I-NEXT:    lbu s9, 31(a0)
+; RV32I-NEXT:    lbu a0, 0(a1)
 ; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    or t1, s1, s0
-; RV32I-NEXT:    or t2, s3, s2
-; RV32I-NEXT:    or t3, s5, s4
-; RV32I-NEXT:    lbu t6, 28(a0)
-; RV32I-NEXT:    lbu s0, 29(a0)
-; RV32I-NEXT:    lbu s1, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli s7, s7, 24
-; RV32I-NEXT:    slli s9, s9, 8
-; RV32I-NEXT:    slli s10, s10, 16
-; RV32I-NEXT:    slli s11, s11, 24
-; RV32I-NEXT:    or s2, s7, s6
-; RV32I-NEXT:    or s3, s9, s8
-; RV32I-NEXT:    or s4, s11, s10
-; RV32I-NEXT:    lbu s5, 0(a1)
-; RV32I-NEXT:    lbu s6, 1(a1)
-; RV32I-NEXT:    lbu s7, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, ra
-; RV32I-NEXT:    addi s8, sp, 8
-; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    slli s6, s6, 8
-; RV32I-NEXT:    slli s7, s7, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    or t5, s0, t6
-; RV32I-NEXT:    or s1, a0, s1
-; RV32I-NEXT:    or t6, s6, s5
-; RV32I-NEXT:    or a1, a1, s7
-; RV32I-NEXT:    srai s0, a0, 31
-; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    or a4, a4, a0
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a6, t0, a7
-; RV32I-NEXT:    or a7, t2, t1
-; RV32I-NEXT:    or t0, s2, t3
-; RV32I-NEXT:    or t1, s4, s3
-; RV32I-NEXT:    or a3, t4, a3
+; RV32I-NEXT:    or s2, s3, s2
+; RV32I-NEXT:    addi s3, sp, 8
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli ra, ra, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    srli a1, a0, 3
 ; RV32I-NEXT:    or t2, s1, t5
-; RV32I-NEXT:    or a0, a1, t6
-; RV32I-NEXT:    sw s0, 56(sp)
-; RV32I-NEXT:    sw s0, 60(sp)
-; RV32I-NEXT:    sw s0, 64(sp)
-; RV32I-NEXT:    sw s0, 68(sp)
-; RV32I-NEXT:    sw s0, 40(sp)
-; RV32I-NEXT:    sw s0, 44(sp)
-; RV32I-NEXT:    sw s0, 48(sp)
-; RV32I-NEXT:    sw s0, 52(sp)
-; RV32I-NEXT:    sw t0, 24(sp)
-; RV32I-NEXT:    sw t1, 28(sp)
-; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    andi t5, a0, 31
+; RV32I-NEXT:    or t3, s4, t3
+; RV32I-NEXT:    or s1, ra, s5
+; RV32I-NEXT:    or s4, s7, s6
+; RV32I-NEXT:    or s5, s9, s8
+; RV32I-NEXT:    srai s6, s9, 31
+; RV32I-NEXT:    andi s7, a1, 28
+; RV32I-NEXT:    xori a1, t5, 31
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, t6, t4
+; RV32I-NEXT:    or a7, s2, s0
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or t1, s1, t3
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    sw s6, 56(sp)
+; RV32I-NEXT:    sw s6, 60(sp)
+; RV32I-NEXT:    sw s6, 64(sp)
+; RV32I-NEXT:    sw s6, 68(sp)
+; RV32I-NEXT:    sw s6, 40(sp)
+; RV32I-NEXT:    sw s6, 44(sp)
+; RV32I-NEXT:    sw s6, 48(sp)
+; RV32I-NEXT:    sw s6, 52(sp)
+; RV32I-NEXT:    add s3, s3, s7
+; RV32I-NEXT:    sw a7, 24(sp)
+; RV32I-NEXT:    sw t0, 28(sp)
+; RV32I-NEXT:    sw t1, 32(sp)
 ; RV32I-NEXT:    sw t2, 36(sp)
-; RV32I-NEXT:    sw a4, 8(sp)
-; RV32I-NEXT:    sw a5, 12(sp)
-; RV32I-NEXT:    sw a6, 16(sp)
-; RV32I-NEXT:    sw a7, 20(sp)
-; RV32I-NEXT:    srli a1, a0, 3
-; RV32I-NEXT:    andi a3, a0, 31
-; RV32I-NEXT:    andi a4, a1, 28
-; RV32I-NEXT:    xori a1, a3, 31
-; RV32I-NEXT:    add a4, s8, a4
-; RV32I-NEXT:    lw a3, 0(a4)
-; RV32I-NEXT:    lw a5, 4(a4)
-; RV32I-NEXT:    lw a6, 8(a4)
-; RV32I-NEXT:    lw a7, 12(a4)
-; RV32I-NEXT:    lw t0, 16(a4)
-; RV32I-NEXT:    lw t1, 20(a4)
-; RV32I-NEXT:    lw t2, 24(a4)
-; RV32I-NEXT:    lw a4, 28(a4)
-; RV32I-NEXT:    srl t3, a5, a0
-; RV32I-NEXT:    slli t4, a6, 1
+; RV32I-NEXT:    sw a3, 8(sp)
+; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a5, 16(sp)
+; RV32I-NEXT:    sw a6, 20(sp)
+; RV32I-NEXT:    lw a3, 0(s3)
+; RV32I-NEXT:    lw a4, 4(s3)
+; RV32I-NEXT:    lw a5, 8(s3)
+; RV32I-NEXT:    lw a6, 12(s3)
+; RV32I-NEXT:    lw a7, 16(s3)
+; RV32I-NEXT:    lw t0, 20(s3)
+; RV32I-NEXT:    lw t1, 24(s3)
+; RV32I-NEXT:    lw t2, 28(s3)
+; RV32I-NEXT:    srl t3, a4, a0
+; RV32I-NEXT:    slli t4, a5, 1
 ; RV32I-NEXT:    srl a3, a3, a0
-; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    srl t5, a7, a0
-; RV32I-NEXT:    slli t6, t0, 1
-; RV32I-NEXT:    srl a6, a6, a0
-; RV32I-NEXT:    slli a7, a7, 1
-; RV32I-NEXT:    srl s0, t1, a0
-; RV32I-NEXT:    slli s1, t2, 1
-; RV32I-NEXT:    srl t0, t0, a0
-; RV32I-NEXT:    slli t1, t1, 1
-; RV32I-NEXT:    srl t2, t2, a0
-; RV32I-NEXT:    slli s2, a4, 1
-; RV32I-NEXT:    sra s3, a4, a0
+; RV32I-NEXT:    slli a4, a4, 1
+; RV32I-NEXT:    srl t5, a6, a0
+; RV32I-NEXT:    slli t6, a7, 1
+; RV32I-NEXT:    srl a5, a5, a0
+; RV32I-NEXT:    slli a6, a6, 1
+; RV32I-NEXT:    srl s0, t0, a0
+; RV32I-NEXT:    slli s1, t1, 1
+; RV32I-NEXT:    srl a7, a7, a0
+; RV32I-NEXT:    slli t0, t0, 1
+; RV32I-NEXT:    srl t1, t1, a0
+; RV32I-NEXT:    slli s2, t2, 1
+; RV32I-NEXT:    sra t2, t2, a0
 ; RV32I-NEXT:    sll a0, t4, a1
-; RV32I-NEXT:    sll a4, a5, a1
-; RV32I-NEXT:    sll a5, t6, a1
-; RV32I-NEXT:    sll a7, a7, a1
-; RV32I-NEXT:    sll t4, s1, a1
-; RV32I-NEXT:    sll t1, t1, a1
-; RV32I-NEXT:    sll t6, s2, a1
-; RV32I-NEXT:    srli s1, s3, 24
-; RV32I-NEXT:    srli s2, s3, 16
-; RV32I-NEXT:    srli s4, s3, 8
+; RV32I-NEXT:    sll a4, a4, a1
+; RV32I-NEXT:    sll t4, t6, a1
+; RV32I-NEXT:    sll a6, a6, a1
+; RV32I-NEXT:    sll t6, s1, a1
+; RV32I-NEXT:    sll t0, t0, a1
+; RV32I-NEXT:    sll s1, s2, a1
+; RV32I-NEXT:    srli s2, t2, 24
+; RV32I-NEXT:    srli s3, t2, 16
+; RV32I-NEXT:    srli s4, t2, 8
 ; RV32I-NEXT:    or a0, t3, a0
 ; RV32I-NEXT:    or a1, a3, a4
-; RV32I-NEXT:    or a3, t5, a5
-; RV32I-NEXT:    or a4, a6, a7
-; RV32I-NEXT:    or a5, s0, t4
-; RV32I-NEXT:    or a6, t0, t1
-; RV32I-NEXT:    or a7, t2, t6
-; RV32I-NEXT:    sb s3, 28(a2)
+; RV32I-NEXT:    or a3, t5, t4
+; RV32I-NEXT:    or a4, a5, a6
+; RV32I-NEXT:    or a5, s0, t6
+; RV32I-NEXT:    or a6, a7, t0
+; RV32I-NEXT:    or a7, t1, s1
+; RV32I-NEXT:    sb t2, 28(a2)
 ; RV32I-NEXT:    sb s4, 29(a2)
-; RV32I-NEXT:    sb s2, 30(a2)
-; RV32I-NEXT:    sb s1, 31(a2)
+; RV32I-NEXT:    sb s3, 30(a2)
+; RV32I-NEXT:    sb s2, 31(a2)
 ; RV32I-NEXT:    srli t0, a7, 24
 ; RV32I-NEXT:    srli t1, a7, 16
 ; RV32I-NEXT:    srli t2, a7, 8
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 6dee3d303a6f2..12d17c7f669de 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -14,12 +14,12 @@ define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-LABEL: m:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    aghi %r15, -168
-; CHECK-NEXT:    lhrl %r1, f+4
-; CHECK-NEXT:    sll %r1, 8
-; CHECK-NEXT:    larl %r2, f
-; CHECK-NEXT:    ic %r1, 6(%r2)
-; CHECK-NEXT:    larl %r2, e
-; CHECK-NEXT:    lb %r0, 3(%r2)
+; CHECK-NEXT:    lhrl %r2, f+4
+; CHECK-NEXT:    larl %r1, f
+; CHECK-NEXT:    llc %r1, 6(%r1)
+; CHECK-NEXT:    larl %r3, e
+; CHECK-NEXT:    lb %r0, 3(%r3)
+; CHECK-NEXT:    rosbg %r1, %r2, 32, 55, 8
 ; CHECK-NEXT:    vlvgf %v1, %r1, 0
 ; CHECK-NEXT:    vlvgf %v1, %r1, 1
 ; CHECK-NEXT:    larl %r2, .LCPI0_0
@@ -29,6 +29,7 @@ define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-NEXT:    vlvgf %v0, %r1, 1
 ; CHECK-NEXT:    vperm %v4, %v1, %v0, %v2
 ; CHECK-NEXT:    vlvgf %v0, %r1, 3
+; CHECK-NEXT:    # kill: def $r1l killed $r1l killed $r1d
 ; CHECK-NEXT:    nilh %r1, 255
 ; CHECK-NEXT:    chi %r1, 128
 ; CHECK-NEXT:    ipm %r1
diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll
index 6e22d855dc831..f6d66ab47ce05 100644
--- a/llvm/test/CodeGen/X86/abds-neg.ll
+++ b/llvm/test/CodeGen/X86/abds-neg.ll
@@ -1058,15 +1058,15 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    sbbl %ecx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -1089,15 +1089,15 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    sbbl %ecx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -1121,11 +1121,11 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
@@ -1178,11 +1178,11 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
diff --git a/llvm/test/CodeGen/X86/abds-vector-128.ll b/llvm/test/CodeGen/X86/abds-vector-128.ll
index bcb42002fb08e..148be83892b72 100644
--- a/llvm/test/CodeGen/X86/abds-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abds-vector-128.ll
@@ -806,7 +806,7 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
 ; AVX512-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm3
 ; AVX512-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
 ; AVX512-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %cmp = icmp sge <2 x i64> %a, %b
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 0de308a9e0738..4e4891a283ce9 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -1734,20 +1734,20 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-LABEL: not_avg_v16i8_wide_constants:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps (%rdi), %xmm1
-; SSE2-NEXT:    movdqa (%rsi), %xmm0
+; SSE2-NEXT:    movdqa (%rsi), %xmm2
 ; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    movd %eax, %xmm0
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
 ; SSE2-NEXT:    movd %eax, %xmm1
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm4
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:    movd %eax, %xmm3
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
 ; SSE2-NEXT:    movd %eax, %xmm5
@@ -1762,9 +1762,6 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT:    movd %eax, %xmm8
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    movd %eax, %xmm10
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    decl %eax
 ; SSE2-NEXT:    movd %eax, %xmm9
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
@@ -1774,6 +1771,9 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT:    movd %eax, %xmm12
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
+; SSE2-NEXT:    movd %eax, %xmm10
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    decl %eax
 ; SSE2-NEXT:    movd %eax, %xmm13
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
@@ -1783,45 +1783,43 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT:    movd %eax, %xmm15
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
 ; SSE2-NEXT:    movapd %xmm4, %xmm5
 ; SSE2-NEXT:    andpd %xmm1, %xmm5
 ; SSE2-NEXT:    xorpd %xmm4, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    paddw %xmm5, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1]
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
-; SSE2-NEXT:    movapd %xmm2, %xmm3
-; SSE2-NEXT:    andpd %xmm0, %xmm3
-; SSE2-NEXT:    xorpd %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    paddw %xmm3, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    packuswb %xmm0, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; SSE2-NEXT:    movapd %xmm0, %xmm3
+; SSE2-NEXT:    andpd %xmm2, %xmm3
+; SSE2-NEXT:    xorpd %xmm0, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    paddw %xmm3, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
 ; SSE2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 362b3b945f962..b932bbfbfb807 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -171,15 +171,15 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    vmovdqa (%edx), %xmm0
-; X86-NEXT:    vpand (%ecx), %xmm0, %xmm0
+; X86-NEXT:    vmovdqa (%ecx), %xmm0
+; X86-NEXT:    vpand (%edx), %xmm0, %xmm0
 ; X86-NEXT:    vpextrb $6, %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_extractelement:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa (%rdi), %xmm0
-; X64-NEXT:    vpand (%rsi), %xmm0, %xmm0
+; X64-NEXT:    vmovdqa (%rsi), %xmm0
+; X64-NEXT:    vpand (%rdi), %xmm0, %xmm0
 ; X64-NEXT:    vpextrb $6, %xmm0, (%rdx)
 ; X64-NEXT:    retq
   %i0 = load <16 x i8>, ptr %origin0
@@ -198,8 +198,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst,
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    vmovdqa (%esi), %xmm0
-; X86-NEXT:    vpand (%edx), %xmm0, %xmm0
+; X86-NEXT:    vmovdqa (%edx), %xmm0
+; X86-NEXT:    vpand (%esi), %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%ecx)
 ; X86-NEXT:    vpextrb $6, %xmm0, (%eax)
 ; X86-NEXT:    popl %esi
@@ -207,8 +207,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst,
 ;
 ; X64-LABEL: freeze_extractelement_escape:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa (%rdi), %xmm0
-; X64-NEXT:    vpand (%rsi), %xmm0, %xmm0
+; X64-NEXT:    vmovdqa (%rsi), %xmm0
+; X64-NEXT:    vpand (%rdi), %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rcx)
 ; X64-NEXT:    vpextrb $6, %xmm0, (%rdx)
 ; X64-NEXT:    retq
@@ -239,8 +239,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
 ; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl 12(%ebp), %esi
 ; X86-NEXT:    movl 8(%ebp), %edi
-; X86-NEXT:    vmovaps (%edi), %xmm0
-; X86-NEXT:    vandps (%esi), %xmm0, %xmm0
+; X86-NEXT:    vmovaps (%esi), %xmm0
+; X86-NEXT:    vandps (%edi), %xmm0, %xmm0
 ; X86-NEXT:    vmovaps %xmm0, (%esp)
 ; X86-NEXT:    movzbl (%esp,%ecx), %ecx
 ; X86-NEXT:    cmpb (%esp,%eax), %cl
@@ -255,8 +255,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
 ; X64:       # %bb.0:
 ; X64-NEXT:    andl $15, %ecx
 ; X64-NEXT:    andl $15, %edx
-; X64-NEXT:    vmovaps (%rdi), %xmm0
-; X64-NEXT:    vandps (%rsi), %xmm0, %xmm0
+; X64-NEXT:    vmovaps (%rsi), %xmm0
+; X64-NEXT:    vandps (%rdi), %xmm0, %xmm0
 ; X64-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzbl -24(%rsp,%rdx), %eax
 ; X64-NEXT:    cmpb -24(%rsp,%rcx), %al
@@ -278,16 +278,22 @@ define void @freeze_buildvector_single_maybe_poison_operand(ptr %origin, ptr %ds
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
-; X86-NEXT:    vpinsrd $0, (%ecx), %xmm0, %xmm0
+; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movl $42, %ecx
+; X86-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_buildvector_single_maybe_poison_operand:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42]
-; X64-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    movl $42, %eax
+; X64-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; X64-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
+; X64-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
 ; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rsi)
@@ -311,18 +317,24 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
-; X86-NEXT:    vpinsrd $0, %ecx, %xmm0, %xmm0
-; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X86-NEXT:    vmovd %ecx, %xmm0
+; X86-NEXT:    movl $42, %edx
+; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_buildvector_single_repeated_maybe_poison_operand:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42]
-; X64-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
-; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    andl $15, %eax
+; X64-NEXT:    vmovd %eax, %xmm0
+; X64-NEXT:    movl $42, %ecx
+; X64-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; X64-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
+; X64-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
 ; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rsi)
@@ -351,10 +363,7 @@ define void @freeze_two_frozen_buildvectors(ptr %origin0, ptr %origin1, ptr %dst
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
 ; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%ecx)
-; X86-NEXT:    vmovd %edx, %xmm0
-; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; X86-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
+; X86-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
 ; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
@@ -367,10 +376,7 @@ define void @freeze_two_frozen_buildvectors(ptr %origin0, ptr %origin1, ptr %dst
 ; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdx)
-; X64-NEXT:    vmovd %eax, %xmm0
-; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
+; X64-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rcx)
 ; X64-NEXT:    retq
@@ -397,15 +403,13 @@ define void @freeze_two_buildvectors_only_one_frozen(ptr %origin0, ptr %origin1,
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl (%edx), %edx
 ; X86-NEXT:    andl $15, %edx
-; X86-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; X86-NEXT:    vmovd %edx, %xmm1
-; X86-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
-; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
-; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [7,7,7,7]
-; X86-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
+; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
+; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%ecx)
-; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
-; X86-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; X86-NEXT:    vmovd %edx, %xmm0
+; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -413,14 +417,13 @@ define void @freeze_two_buildvectors_only_one_frozen(ptr %origin0, ptr %origin1,
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    andl $15, %eax
-; X64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; X64-NEXT:    vmovd %eax, %xmm1
-; X64-NEXT:    vpbroadcastd %xmm1, %xmm1
-; X64-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; X64-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
-; X64-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
+; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdx)
-; X64-NEXT:    vpand %xmm2, %xmm1, %xmm0
+; X64-NEXT:    vmovd %eax, %xmm0
+; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
+; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rcx)
 ; X64-NEXT:    retq
   %i0.src = load i32, ptr %origin0
@@ -445,23 +448,28 @@ define void @freeze_two_buildvectors_one_undef_elt(ptr %origin0, ptr %origin1, p
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl (%edx), %edx
 ; X86-NEXT:    andl $15, %edx
-; X86-NEXT:    vmovddup {{.*#+}} xmm0 = [7,0,7,0]
-; X86-NEXT:    # xmm0 = mem[0,0]
-; X86-NEXT:    vmovd %edx, %xmm1
-; X86-NEXT:    vpand %xmm0, %xmm1, %xmm2
-; X86-NEXT:    vmovdqa %xmm2, (%ecx)
-; X86-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vpinsrd $0, %edx, %xmm0, %xmm0
+; X86-NEXT:    vmovddup {{.*#+}} xmm1 = [7,0,7,0]
+; X86-NEXT:    # xmm1 = mem[0,0]
+; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vmovdqa %xmm0, (%ecx)
+; X86-NEXT:    vmovd %edx, %xmm0
+; X86-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_two_buildvectors_one_undef_elt:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    vmovd %eax, %xmm0
-; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
-; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    andl $15, %eax
+; X64-NEXT:    vpinsrq $0, %rax, %xmm0, %xmm0
+; X64-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [7,7]
+; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdx)
+; X64-NEXT:    vmovq %rax, %xmm0
+; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
+; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rcx)
 ; X64-NEXT:    retq
   %i0.src = load i64, ptr %origin0
@@ -534,9 +542,8 @@ define void @freeze_buildvector_one_undef_elt(ptr %origin0, ptr %origin1, ptr %o
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vpinsrd $0, (%esi), %xmm0, %xmm0
 ; X86-NEXT:    vpinsrd $1, (%edx), %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
 ; X86-NEXT:    vpinsrd $3, (%ecx), %xmm0, %xmm0
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
@@ -545,9 +552,8 @@ define void @freeze_buildvector_one_undef_elt(ptr %origin0, ptr %origin1, ptr %o
 ;
 ; X64-LABEL: freeze_buildvector_one_undef_elt:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    vpinsrd $0, (%rdi), %xmm0, %xmm0
 ; X64-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm0
-; X64-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
 ; X64-NEXT:    vpinsrd $3, (%rcx), %xmm0, %xmm0
 ; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
 ; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index 2ac2be5545dfd..d2b292f1a7996 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -119,8 +119,8 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-AVX2-NEXT:  .LBB0_2: # %vector.body
 ; CHECK-AVX2-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-AVX2-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-AVX2-NEXT:    vmovdqu 1024(%rdx,%rsi), %xmm5
-; CHECK-AVX2-NEXT:    vmovdqu 1040(%rdx,%rsi), %xmm6
+; CHECK-AVX2-NEXT:    vmovdqu 1024(%rdx,%rsi), %ymm5
+; CHECK-AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; CHECK-AVX2-NEXT:    vpextrq $1, %xmm5, %rdi
 ; CHECK-AVX2-NEXT:    vpextrq $1, %xmm6, %r8
 ; CHECK-AVX2-NEXT:    vmovq %xmm5, %r9


