[llvm-branch-commits] [llvm] [DAG] visitFREEZE - limit freezing of multiple operands (PR #150425)
Simon Pilgrim via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jul 24 07:11:01 PDT 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/150425
This is a partial revert of https://github.com/llvm/llvm-project/pull/145939 (I've kept the BUILD_VECTOR(FREEZE(UNDEF), FREEZE(UNDEF), elt2, ...) canonicalization), as we're getting reports of infinite loops (https://github.com/llvm/llvm-project/issues/148084).
The issue appears to stem from deep chains of nodes and from how visitFREEZE replaces all instances of an operand with a common frozen version. Other users of the original node are then added back to the worklist, but they may no longer be able to confirm a node isn't poison due to the recursion depth limit on isGuaranteedNotToBeUndefOrPoison.
The issue still exists with the old implementation, but allowing only a single frozen operand helps prevent cases of interdependent frozen nodes.
I'm still working on supporting multiple operands, as it's critical for topological DAG handling, but a fix needs to land for trunk and 21.x first.
Fixes https://github.com/llvm/llvm-project/issues/148084
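For context, the fold being limited here pushes a freeze toward the leaves: freeze(op(x, y)) becomes op(freeze(x), freeze(y)) whenever op itself cannot create undef or poison. A minimal illustrative sketch of the unrestricted form (names are mine, not the actual DAGCombiner code):

// Illustrative sketch only: push a freeze through a node by freezing every
// operand. The real combine only freezes the operands that may be poison and
// uses ReplaceAllUsesOfValueWith so other users see the frozen values too.
SDValue pushFreezeThroughOp(SelectionDAG &DAG, SDValue N0) {
  SmallVector<SDValue, 4> FrozenOps;
  for (SDValue Op : N0->ops())
    FrozenOps.push_back(DAG.getFreeze(Op));
  return DAG.getNode(N0.getOpcode(), SDLoc(N0), N0->getVTList(), FrozenOps);
}

It is the ReplaceAllUsesOfValueWith step that re-adds other users of the original operands to the worklist, setting up the looping hazard described above.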
From a612be6bfed905b3b5937f495a5cdf9bb6be0edf Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 18 Jul 2025 17:38:11 +0100
Subject: [PATCH 1/5] [DAG] visitFREEZE - remove unused HadMaybePoisonOperands
check. NFC. (#149517)
Redundant since #145939
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 23812d795f5fa..4cd0a9a4fc3e7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16751,12 +16751,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
/*Depth*/ 1))
continue;
- bool HadMaybePoisonOperands = !MaybePoisonOperands.empty();
- bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second;
- if (IsNewMaybePoisonOperand)
+ if (MaybePoisonOperands.insert(Op).second)
MaybePoisonOperandNumbers.push_back(OpNo);
- if (!HadMaybePoisonOperands)
- continue;
}
// NOTE: the whole op may not be guaranteed not to be undef or poison because
// it could create undef or poison due to its poison-generating flags.
From 4f7368ac4ceceda11a50069a9386c34cf5afff28 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Sun, 20 Jul 2025 13:06:55 +0100
Subject: [PATCH 2/5] [DAG] Add missing Depth argument to
isGuaranteedNotToBeUndefOrPoison calls inside SimplifyDemanded methods
(#149550)
Ensure we don't exceed the maximum recursion depth
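The Depth parameter matters because these queries share a recursion budget (SelectionDAG::MaxRecursionDepth); dropping the argument at a call site effectively resets that budget. A hedged sketch of the standard pattern, not the real implementation:

// Once the budget is exhausted the query must answer conservatively.
bool isGuaranteedNotToBePoisonSketch(SDValue Op, unsigned Depth) {
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return false; // give up: assume the value may be poison
  for (SDValue SubOp : Op->ops())
    if (!isGuaranteedNotToBePoisonSketch(SubOp, Depth + 1)) // thread Depth + 1
      return false;
  return true; // the real code also performs opcode-specific checks
}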
---
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 5 +++--
llvm/test/CodeGen/X86/pr62286.ll | 8 ++++----
2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e0597988e8907..37fddcf943681 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -778,7 +778,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
case ISD::FREEZE: {
SDValue N0 = Op.getOperand(0);
if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
- /*PoisonOnly=*/false))
+ /*PoisonOnly=*/false, Depth + 1))
return N0;
break;
}
@@ -3369,7 +3369,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(
case ISD::FREEZE: {
SDValue N0 = Op.getOperand(0);
if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
- /*PoisonOnly=*/false))
+ /*PoisonOnly=*/false,
+ Depth + 1))
return TLO.CombineTo(Op, N0);
// TODO: Replace this with the general fold from DAGCombiner::visitFREEZE
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index 2d1b7fcbf0239..9728e130333c4 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -42,10 +42,10 @@ define i64 @PR62286(i32 %a) {
; AVX2-LABEL: PR62286:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1
-; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
From 4f07800533ac770223407177cfc38e1bc6c4774d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 22 Jul 2025 07:50:17 +0100
Subject: [PATCH 3/5] [X86] canCreateUndefOrPoisonForTargetNode - SSE
PINSR/PEXTR vector element insert/extract are never out of bounds (#149822)
The immediate index is guaranteed to be interpreted modulo the number of vector elements
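For example, SSE PEXTRW on a v8i16 register only uses the low three bits of its immediate, so every encodable immediate selects a valid lane and the node cannot introduce undef or poison. A tiny hypothetical helper to make the modulo behaviour concrete:

// The hardware reads the lane immediate modulo the element count, so an
// "out of range" immediate still addresses a real lane.
unsigned effectivePextrwLane(unsigned Imm) {
  constexpr unsigned NumElts = 8; // v8i16
  return Imm % NumElts;           // equivalent to Imm & 7
}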
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 15 +++--
llvm/test/CodeGen/X86/avg.ll | 81 ++++++++++++-------------
2 files changed, 50 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5e35d5630d667..d755c7e9bf3b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45096,27 +45096,34 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
switch (Op.getOpcode()) {
+ // SSE vector insert/extracts use modulo indices.
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW:
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW:
+ return false;
// SSE vector multiplies are either inbounds or saturate.
case X86ISD::VPMADDUBSW:
case X86ISD::VPMADDWD:
+ return false;
// SSE vector shifts handle out of bounds shift amounts.
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI:
return false;
- // SSE blends.
+ // SSE blends.
case X86ISD::BLENDI:
case X86ISD::BLENDV:
return false;
- // SSE target shuffles.
+ // SSE target shuffles.
case X86ISD::PSHUFD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VPERMILPI:
case X86ISD::VPERMV3:
return false;
- // SSE comparisons handle all icmp/fcmp cases.
- // TODO: Add CMPM/MM with test coverage.
+ // SSE comparisons handle all icmp/fcmp cases.
+ // TODO: Add CMPM/MM with test coverage.
case X86ISD::CMPP:
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT:
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 217ccebdfb77f..9be816655072c 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -1829,73 +1829,70 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX1-NEXT: vpextrd $2, %xmm5, %ecx
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; AVX1-NEXT: vpextrd $2, %xmm4, %eax
; AVX1-NEXT: vpextrw $3, %xmm3, %edx
+; AVX1-NEXT: vpextrw $2, %xmm3, %ecx
+; AVX1-NEXT: vpextrw $1, %xmm3, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm4
-; AVX1-NEXT: vpextrw $2, %xmm3, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm5
-; AVX1-NEXT: vpextrw $1, %xmm3, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm6
; AVX1-NEXT: vpextrw $0, %xmm3, %edx
+; AVX1-NEXT: decl %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm5
+; AVX1-NEXT: vpextrw $3, %xmm2, %ecx
+; AVX1-NEXT: decl %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vpextrw $2, %xmm2, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm7
-; AVX1-NEXT: vpextrw $3, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm8
-; AVX1-NEXT: vpextrw $2, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm9
; AVX1-NEXT: vpextrw $1, %xmm2, %edx
+; AVX1-NEXT: decl %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm8
+; AVX1-NEXT: vpextrw $0, %xmm2, %ecx
+; AVX1-NEXT: decl %eax
+; AVX1-NEXT: vmovd %eax, %xmm9
+; AVX1-NEXT: vpextrw $7, %xmm3, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm10
-; AVX1-NEXT: vpextrw $0, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm11
-; AVX1-NEXT: vpextrw $5, %xmm3, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm12
-; AVX1-NEXT: vpextrw $4, %xmm3, %edx
+; AVX1-NEXT: vpextrw $6, %xmm3, %edx
+; AVX1-NEXT: decl %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm11
+; AVX1-NEXT: vpextrw $7, %xmm2, %ecx
+; AVX1-NEXT: decl %eax
+; AVX1-NEXT: vmovd %eax, %xmm12
+; AVX1-NEXT: vpextrw $6, %xmm2, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm13
-; AVX1-NEXT: vpextrw $5, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm14
-; AVX1-NEXT: vpextrw $4, %xmm2, %edx
+; AVX1-NEXT: vpextrw $5, %xmm3, %edx
+; AVX1-NEXT: decl %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm14
+; AVX1-NEXT: vpextrw $4, %xmm3, %ecx
+; AVX1-NEXT: decl %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vpextrw $5, %xmm2, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm15
-; AVX1-NEXT: vpextrw $7, %xmm3, %edx
+; AVX1-NEXT: vpextrw $4, %xmm2, %edx
; AVX1-NEXT: decl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm3
-; AVX1-NEXT: vpextrw $7, %xmm2, %ecx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
+; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: decl %eax
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: decl %ecx
+; AVX1-NEXT: decl %edx
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX1-NEXT: vmovd %ecx, %xmm7
+; AVX1-NEXT: vmovd %edx, %xmm7
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX1-NEXT: vmovddup {{.*#+}} ymm3 = ymm6[0,0,2,2]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
+; AVX1-NEXT: vmovddup {{.*#+}} ymm2 = ymm2[0,0,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm1
From 9b4ff42998ddbf1e79d63ec521d462b3ad9eaa3f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 22 Jul 2025 15:40:55 +0100
Subject: [PATCH 4/5] [DAG] visitFREEZE - limit freezing of multiple operands
(#149797)
This is a partial revert of #145939 (I've kept the BUILD_VECTOR(FREEZE(UNDEF), FREEZE(UNDEF), elt2, ...) canonicalization), as we're getting reports of infinite loops (#148084).
The issue appears to stem from deep chains of nodes and from how visitFREEZE replaces all instances of an operand with a common frozen version. Other users of the original node are then added back to the worklist, but they may no longer be able to confirm a node isn't poison due to the recursion depth limit on isGuaranteedNotToBeUndefOrPoison.
The issue still exists with the old implementation, but allowing only a single frozen operand helps prevent cases of interdependent frozen nodes.
I'm still working on supporting multiple operands, as it's critical for topological DAG handling, but a fix needs to land for trunk and 21.x first.
Fixes #148084
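For readability, the AllowMultipleMaybePoisonOperands guard introduced below can be restated as an opcode allowlist; this hypothetical switch is equivalent to the chained comparisons in the patch:

// Only these opcodes may keep more than one maybe-poison operand; everything
// else bails out of the fold after the first one.
bool allowMultipleMaybePoisonOperands(unsigned Opc) {
  switch (Opc) {
  case ISD::SELECT_CC:
  case ISD::SETCC:
  case ISD::BUILD_VECTOR:
  case ISD::BUILD_PAIR:
  case ISD::VECTOR_SHUFFLE:
  case ISD::CONCAT_VECTORS:
  case ISD::FMUL:
    return true;
  default:
    return false;
  }
}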
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 25 +-
llvm/test/CodeGen/AMDGPU/div_i128.ll | 64 +-
llvm/test/CodeGen/AMDGPU/rem_i128.ll | 64 +-
llvm/test/CodeGen/NVPTX/i1-select.ll | 30 +-
llvm/test/CodeGen/NVPTX/i128.ll | 582 +++--
llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll | 417 ++--
llvm/test/CodeGen/RISCV/fpclamptosat.ll | 88 +-
.../RISCV/intrinsic-cttz-elts-vscale.ll | 36 +-
.../RISCV/wide-scalar-shift-legalization.ll | 1901 +++++++++--------
llvm/test/CodeGen/SystemZ/pr60413.ll | 36 +-
llvm/test/CodeGen/X86/abds-neg.ll | 36 +-
llvm/test/CodeGen/X86/avg.ll | 156 +-
.../X86/div-rem-pair-recomposition-signed.ll | 39 +-
.../div-rem-pair-recomposition-unsigned.ll | 12 +-
llvm/test/CodeGen/X86/freeze-vector.ll | 24 +-
.../test/CodeGen/X86/setcc-non-simple-type.ll | 4 +-
16 files changed, 1816 insertions(+), 1698 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4cd0a9a4fc3e7..91fd2d843f445 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16717,7 +16717,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
// Try to push freeze through instructions that propagate but don't produce
// poison as far as possible. If an operand of freeze follows three
- // conditions 1) one-use, and 2) does not produce poison then push
+ // conditions 1) one-use, 2) does not produce poison, and 3) has all but one
+ // guaranteed-non-poison operands (or is a BUILD_VECTOR or similar) then push
// the freeze through to the operands that are not guaranteed non-poison.
// NOTE: we will strip poison-generating flags, so ignore them here.
if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false,
@@ -16725,6 +16726,18 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
N0->getNumValues() != 1 || !N0->hasOneUse())
return SDValue();
+  // TODO: we should always allow multiple operands; however, this increases
+  // the likelihood of infinite loops because the ReplaceAllUsesOfValueWith
+  // call below causes later nodes that share frozen operands to fold again
+  // while no longer being able to confirm other operands are not poison due
+  // to recursion depth limits on isGuaranteedNotToBeUndefOrPoison.
+ bool AllowMultipleMaybePoisonOperands =
+ N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
+ N0.getOpcode() == ISD::BUILD_VECTOR ||
+ N0.getOpcode() == ISD::BUILD_PAIR ||
+ N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
+ N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
+
// Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
// ones" or "constant" into something that depends on FrozenUndef. We can
// instead pick undef values to keep those properties, while at the same time
@@ -16751,8 +16764,16 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
/*Depth*/ 1))
continue;
- if (MaybePoisonOperands.insert(Op).second)
+ bool HadMaybePoisonOperands = !MaybePoisonOperands.empty();
+ bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second;
+ if (IsNewMaybePoisonOperand)
MaybePoisonOperandNumbers.push_back(OpNo);
+ if (!HadMaybePoisonOperands)
+ continue;
+ if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
+ // Multiple maybe-poison ops when not allowed - bail out.
+ return SDValue();
+ }
}
// NOTE: the whole op may not be guaranteed not to be undef or poison because
// it could create undef or poison due to its poison-generating flags.
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index f8e13fcdd2273..51398a45055eb 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -475,28 +475,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -507,7 +500,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1043,10 +1035,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2664,28 +2656,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -2696,7 +2681,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -3232,10 +3216,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index ba9dd8f7c2468..6512bee36e88b 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -513,28 +513,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -545,7 +538,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1081,10 +1073,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -1897,28 +1889,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -1929,7 +1914,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_mov_b32 s14, s13
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -2465,10 +2449,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
; GFX9-O0-NEXT: s_mov_b32 s5, s6
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index f1adc3489c0d9..9a051b3fd8bb7 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -94,27 +94,27 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
; CHECK-LABEL: test_select_i1_basic_folding(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<12>;
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .pred %p<13>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_folding_param_0];
; CHECK-NEXT: setp.eq.b32 %p1, %r1, 0;
-; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_1];
-; CHECK-NEXT: setp.ne.b32 %p2, %r3, 0;
-; CHECK-NEXT: setp.eq.b32 %p3, %r3, 0;
-; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_2];
-; CHECK-NEXT: setp.eq.b32 %p4, %r5, 0;
-; CHECK-NEXT: ld.param.b32 %r6, [test_select_i1_basic_folding_param_3];
+; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_folding_param_1];
+; CHECK-NEXT: setp.ne.b32 %p2, %r2, 0;
+; CHECK-NEXT: setp.eq.b32 %p3, %r2, 0;
+; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
+; CHECK-NEXT: setp.eq.b32 %p4, %r3, 0;
+; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
; CHECK-NEXT: xor.pred %p6, %p1, %p3;
-; CHECK-NEXT: ld.param.b32 %r7, [test_select_i1_basic_folding_param_4];
+; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
; CHECK-NEXT: and.pred %p7, %p6, %p4;
-; CHECK-NEXT: and.pred %p8, %p2, %p4;
-; CHECK-NEXT: and.pred %p9, %p3, %p7;
-; CHECK-NEXT: or.pred %p10, %p9, %p8;
-; CHECK-NEXT: xor.pred %p11, %p10, %p3;
-; CHECK-NEXT: selp.b32 %r8, %r6, %r7, %p11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: and.pred %p9, %p2, %p4;
+; CHECK-NEXT: and.pred %p10, %p3, %p7;
+; CHECK-NEXT: or.pred %p11, %p10, %p9;
+; CHECK-NEXT: xor.pred %p12, %p11, %p3;
+; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p12;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-NEXT: ret;
%b1 = icmp eq i32 %v1, 0
%b2 = icmp eq i32 %v2, 0
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index f2211eb1c0b8e..44d85589b5056 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -5,9 +5,9 @@
define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: srem_i128(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<22>;
+; CHECK-NEXT: .reg .pred %p<20>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<126>;
+; CHECK-NEXT: .reg .b64 %rd<127>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0];
@@ -42,103 +42,102 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd62, %r4;
; CHECK-NEXT: add.s64 %rd63, %rd62, 64;
; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7;
-; CHECK-NEXT: mov.b64 %rd116, 0;
+; CHECK-NEXT: mov.b64 %rd117, 0;
; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64;
-; CHECK-NEXT: subc.cc.s64 %rd8, %rd116, 0;
-; CHECK-NEXT: setp.ne.b64 %p8, %rd8, 0;
-; CHECK-NEXT: and.pred %p10, %p8, %p8;
-; CHECK-NEXT: setp.eq.b64 %p11, %rd8, 0;
-; CHECK-NEXT: setp.gt.u64 %p12, %rd66, 127;
-; CHECK-NEXT: and.pred %p13, %p11, %p12;
-; CHECK-NEXT: or.pred %p14, %p13, %p10;
-; CHECK-NEXT: or.pred %p15, %p5, %p14;
-; CHECK-NEXT: xor.b64 %rd67, %rd66, 127;
-; CHECK-NEXT: or.b64 %rd68, %rd67, %rd8;
-; CHECK-NEXT: setp.eq.b64 %p16, %rd68, 0;
-; CHECK-NEXT: selp.b64 %rd125, 0, %rd4, %p15;
-; CHECK-NEXT: selp.b64 %rd124, 0, %rd3, %p15;
-; CHECK-NEXT: or.pred %p17, %p15, %p16;
-; CHECK-NEXT: @%p17 bra $L__BB0_5;
+; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0;
+; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127;
+; CHECK-NEXT: setp.eq.b64 %p9, %rd67, 0;
+; CHECK-NEXT: and.pred %p10, %p9, %p8;
+; CHECK-NEXT: setp.ne.b64 %p11, %rd67, 0;
+; CHECK-NEXT: or.pred %p12, %p10, %p11;
+; CHECK-NEXT: or.pred %p13, %p5, %p12;
+; CHECK-NEXT: xor.b64 %rd68, %rd66, 127;
+; CHECK-NEXT: or.b64 %rd69, %rd68, %rd67;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd69, 0;
+; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p13;
+; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p13;
+; CHECK-NEXT: or.pred %p15, %p13, %p14;
+; CHECK-NEXT: @%p15 bra $L__BB0_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd118, %rd66, 1;
-; CHECK-NEXT: addc.cc.s64 %rd119, %rd8, 0;
-; CHECK-NEXT: or.b64 %rd71, %rd118, %rd119;
-; CHECK-NEXT: setp.eq.b64 %p18, %rd71, 0;
+; CHECK-NEXT: add.cc.s64 %rd119, %rd66, 1;
+; CHECK-NEXT: addc.cc.s64 %rd120, %rd67, 0;
+; CHECK-NEXT: or.b64 %rd72, %rd119, %rd120;
+; CHECK-NEXT: setp.eq.b64 %p16, %rd72, 0;
; CHECK-NEXT: cvt.u32.u64 %r5, %rd66;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd72, %rd4, %r6;
+; CHECK-NEXT: shl.b64 %rd73, %rd4, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd73, %rd3, %r7;
-; CHECK-NEXT: or.b64 %rd74, %rd72, %rd73;
+; CHECK-NEXT: shr.u64 %rd74, %rd3, %r7;
+; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd75, %rd3, %r8;
-; CHECK-NEXT: setp.gt.s32 %p19, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd123, %rd75, %rd74, %p19;
-; CHECK-NEXT: shl.b64 %rd122, %rd3, %r6;
-; CHECK-NEXT: mov.b64 %rd113, %rd116;
-; CHECK-NEXT: @%p18 bra $L__BB0_4;
+; CHECK-NEXT: shl.b64 %rd76, %rd3, %r8;
+; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd124, %rd76, %rd75, %p17;
+; CHECK-NEXT: shl.b64 %rd123, %rd3, %r6;
+; CHECK-NEXT: mov.b64 %rd114, %rd117;
+; CHECK-NEXT: @%p16 bra $L__BB0_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd118;
-; CHECK-NEXT: shr.u64 %rd78, %rd3, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd119;
+; CHECK-NEXT: shr.u64 %rd79, %rd3, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd79, %rd4, %r10;
-; CHECK-NEXT: or.b64 %rd80, %rd78, %rd79;
+; CHECK-NEXT: shl.b64 %rd80, %rd4, %r10;
+; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd81, %rd4, %r11;
-; CHECK-NEXT: setp.gt.s32 %p20, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd120, %rd81, %rd80, %p20;
-; CHECK-NEXT: shr.u64 %rd121, %rd4, %r9;
+; CHECK-NEXT: shr.u64 %rd82, %rd4, %r11;
+; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT: selp.b64 %rd121, %rd82, %rd81, %p18;
+; CHECK-NEXT: shr.u64 %rd122, %rd4, %r9;
; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1;
; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1;
-; CHECK-NEXT: mov.b64 %rd113, 0;
-; CHECK-NEXT: mov.b64 %rd116, %rd113;
+; CHECK-NEXT: mov.b64 %rd114, 0;
+; CHECK-NEXT: mov.b64 %rd117, %rd114;
; CHECK-NEXT: $L__BB0_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd82, %rd120, 63;
-; CHECK-NEXT: shl.b64 %rd83, %rd121, 1;
-; CHECK-NEXT: or.b64 %rd84, %rd83, %rd82;
-; CHECK-NEXT: shl.b64 %rd85, %rd120, 1;
-; CHECK-NEXT: shr.u64 %rd86, %rd123, 63;
-; CHECK-NEXT: or.b64 %rd87, %rd85, %rd86;
-; CHECK-NEXT: shr.u64 %rd88, %rd122, 63;
-; CHECK-NEXT: shl.b64 %rd89, %rd123, 1;
-; CHECK-NEXT: or.b64 %rd90, %rd89, %rd88;
-; CHECK-NEXT: shl.b64 %rd91, %rd122, 1;
-; CHECK-NEXT: or.b64 %rd122, %rd116, %rd91;
-; CHECK-NEXT: or.b64 %rd123, %rd113, %rd90;
-; CHECK-NEXT: sub.cc.s64 %rd92, %rd35, %rd87;
-; CHECK-NEXT: subc.cc.s64 %rd93, %rd36, %rd84;
-; CHECK-NEXT: shr.s64 %rd94, %rd93, 63;
-; CHECK-NEXT: and.b64 %rd116, %rd94, 1;
-; CHECK-NEXT: and.b64 %rd95, %rd94, %rd5;
-; CHECK-NEXT: and.b64 %rd96, %rd94, %rd6;
-; CHECK-NEXT: sub.cc.s64 %rd120, %rd87, %rd95;
-; CHECK-NEXT: subc.cc.s64 %rd121, %rd84, %rd96;
-; CHECK-NEXT: add.cc.s64 %rd118, %rd118, -1;
-; CHECK-NEXT: addc.cc.s64 %rd119, %rd119, -1;
-; CHECK-NEXT: or.b64 %rd97, %rd118, %rd119;
-; CHECK-NEXT: setp.eq.b64 %p21, %rd97, 0;
-; CHECK-NEXT: @%p21 bra $L__BB0_4;
+; CHECK-NEXT: shr.u64 %rd83, %rd121, 63;
+; CHECK-NEXT: shl.b64 %rd84, %rd122, 1;
+; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83;
+; CHECK-NEXT: shl.b64 %rd86, %rd121, 1;
+; CHECK-NEXT: shr.u64 %rd87, %rd124, 63;
+; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87;
+; CHECK-NEXT: shr.u64 %rd89, %rd123, 63;
+; CHECK-NEXT: shl.b64 %rd90, %rd124, 1;
+; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT: shl.b64 %rd92, %rd123, 1;
+; CHECK-NEXT: or.b64 %rd123, %rd117, %rd92;
+; CHECK-NEXT: or.b64 %rd124, %rd114, %rd91;
+; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88;
+; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85;
+; CHECK-NEXT: shr.s64 %rd95, %rd94, 63;
+; CHECK-NEXT: and.b64 %rd117, %rd95, 1;
+; CHECK-NEXT: and.b64 %rd96, %rd95, %rd5;
+; CHECK-NEXT: and.b64 %rd97, %rd95, %rd6;
+; CHECK-NEXT: sub.cc.s64 %rd121, %rd88, %rd96;
+; CHECK-NEXT: subc.cc.s64 %rd122, %rd85, %rd97;
+; CHECK-NEXT: add.cc.s64 %rd119, %rd119, -1;
+; CHECK-NEXT: addc.cc.s64 %rd120, %rd120, -1;
+; CHECK-NEXT: or.b64 %rd98, %rd119, %rd120;
+; CHECK-NEXT: setp.eq.b64 %p19, %rd98, 0;
+; CHECK-NEXT: @%p19 bra $L__BB0_4;
; CHECK-NEXT: bra.uni $L__BB0_2;
; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd98, %rd122, 63;
-; CHECK-NEXT: shl.b64 %rd99, %rd123, 1;
-; CHECK-NEXT: or.b64 %rd100, %rd99, %rd98;
-; CHECK-NEXT: shl.b64 %rd101, %rd122, 1;
-; CHECK-NEXT: or.b64 %rd124, %rd116, %rd101;
-; CHECK-NEXT: or.b64 %rd125, %rd113, %rd100;
+; CHECK-NEXT: shr.u64 %rd99, %rd123, 63;
+; CHECK-NEXT: shl.b64 %rd100, %rd124, 1;
+; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99;
+; CHECK-NEXT: shl.b64 %rd102, %rd123, 1;
+; CHECK-NEXT: or.b64 %rd125, %rd117, %rd102;
+; CHECK-NEXT: or.b64 %rd126, %rd114, %rd101;
; CHECK-NEXT: $L__BB0_5: // %udiv-end
-; CHECK-NEXT: mul.hi.u64 %rd102, %rd5, %rd124;
-; CHECK-NEXT: mad.lo.s64 %rd103, %rd5, %rd125, %rd102;
-; CHECK-NEXT: mad.lo.s64 %rd104, %rd6, %rd124, %rd103;
-; CHECK-NEXT: mul.lo.s64 %rd105, %rd5, %rd124;
-; CHECK-NEXT: sub.cc.s64 %rd106, %rd3, %rd105;
-; CHECK-NEXT: subc.cc.s64 %rd107, %rd4, %rd104;
-; CHECK-NEXT: xor.b64 %rd108, %rd106, %rd2;
+; CHECK-NEXT: mul.hi.u64 %rd103, %rd5, %rd125;
+; CHECK-NEXT: mad.lo.s64 %rd104, %rd5, %rd126, %rd103;
+; CHECK-NEXT: mad.lo.s64 %rd105, %rd6, %rd125, %rd104;
+; CHECK-NEXT: mul.lo.s64 %rd106, %rd5, %rd125;
+; CHECK-NEXT: sub.cc.s64 %rd107, %rd3, %rd106;
+; CHECK-NEXT: subc.cc.s64 %rd108, %rd4, %rd105;
; CHECK-NEXT: xor.b64 %rd109, %rd107, %rd2;
-; CHECK-NEXT: sub.cc.s64 %rd110, %rd108, %rd2;
-; CHECK-NEXT: subc.cc.s64 %rd111, %rd109, %rd2;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd110, %rd111};
+; CHECK-NEXT: xor.b64 %rd110, %rd108, %rd2;
+; CHECK-NEXT: sub.cc.s64 %rd111, %rd109, %rd2;
+; CHECK-NEXT: subc.cc.s64 %rd112, %rd110, %rd2;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd111, %rd112};
; CHECK-NEXT: ret;
%div = srem i128 %lhs, %rhs
ret i128 %div
@@ -149,7 +148,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<18>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<111>;
+; CHECK-NEXT: .reg .b64 %rd<113>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0];
@@ -173,98 +172,98 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd52, %r4;
; CHECK-NEXT: add.s64 %rd53, %rd52, 64;
; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT: mov.b64 %rd101, 0;
-; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT: subc.cc.s64 %rd6, %rd101, 0;
-; CHECK-NEXT: setp.gt.u64 %p6, %rd5, 127;
-; CHECK-NEXT: setp.eq.b64 %p7, %rd6, 0;
+; CHECK-NEXT: mov.b64 %rd103, 0;
+; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT: subc.cc.s64 %rd57, %rd103, 0;
+; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0;
; CHECK-NEXT: and.pred %p8, %p7, %p6;
-; CHECK-NEXT: setp.ne.b64 %p9, %rd6, 0;
+; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0;
; CHECK-NEXT: or.pred %p10, %p8, %p9;
; CHECK-NEXT: or.pred %p11, %p3, %p10;
-; CHECK-NEXT: xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT: setp.eq.b64 %p12, %rd57, 0;
-; CHECK-NEXT: selp.b64 %rd110, 0, %rd42, %p11;
-; CHECK-NEXT: selp.b64 %rd109, 0, %rd41, %p11;
+; CHECK-NEXT: xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0;
+; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p11;
+; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p11;
; CHECK-NEXT: or.pred %p13, %p11, %p12;
; CHECK-NEXT: @%p13 bra $L__BB1_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd103, %rd5, 1;
-; CHECK-NEXT: addc.cc.s64 %rd104, %rd6, 0;
-; CHECK-NEXT: or.b64 %rd60, %rd103, %rd104;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd60, 0;
-; CHECK-NEXT: cvt.u32.u64 %r5, %rd5;
+; CHECK-NEXT: add.cc.s64 %rd105, %rd56, 1;
+; CHECK-NEXT: addc.cc.s64 %rd106, %rd57, 0;
+; CHECK-NEXT: or.b64 %rd62, %rd105, %rd106;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd56;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd61, %rd42, %r6;
+; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd62, %rd41, %r7;
-; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7;
+; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd64, %rd41, %r8;
+; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8;
; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd108, %rd64, %rd63, %p15;
-; CHECK-NEXT: shl.b64 %rd107, %rd41, %r6;
-; CHECK-NEXT: mov.b64 %rd98, %rd101;
+; CHECK-NEXT: selp.b64 %rd110, %rd66, %rd65, %p15;
+; CHECK-NEXT: shl.b64 %rd109, %rd41, %r6;
+; CHECK-NEXT: mov.b64 %rd100, %rd103;
; CHECK-NEXT: @%p14 bra $L__BB1_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd103;
-; CHECK-NEXT: shr.u64 %rd67, %rd41, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd105;
+; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd68, %rd42, %r10;
-; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10;
+; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd70, %rd42, %r11;
+; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11;
; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd105, %rd70, %rd69, %p16;
-; CHECK-NEXT: shr.u64 %rd106, %rd42, %r9;
+; CHECK-NEXT: selp.b64 %rd107, %rd72, %rd71, %p16;
+; CHECK-NEXT: shr.u64 %rd108, %rd42, %r9;
; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1;
; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1;
-; CHECK-NEXT: mov.b64 %rd98, 0;
-; CHECK-NEXT: mov.b64 %rd101, %rd98;
+; CHECK-NEXT: mov.b64 %rd100, 0;
+; CHECK-NEXT: mov.b64 %rd103, %rd100;
; CHECK-NEXT: $L__BB1_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd71, %rd105, 63;
-; CHECK-NEXT: shl.b64 %rd72, %rd106, 1;
-; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT: shl.b64 %rd74, %rd105, 1;
-; CHECK-NEXT: shr.u64 %rd75, %rd108, 63;
-; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT: shr.u64 %rd77, %rd107, 63;
-; CHECK-NEXT: shl.b64 %rd78, %rd108, 1;
-; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT: shl.b64 %rd80, %rd107, 1;
-; CHECK-NEXT: or.b64 %rd107, %rd101, %rd80;
-; CHECK-NEXT: or.b64 %rd108, %rd98, %rd79;
-; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT: shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT: and.b64 %rd101, %rd83, 1;
-; CHECK-NEXT: and.b64 %rd84, %rd83, %rd3;
-; CHECK-NEXT: and.b64 %rd85, %rd83, %rd4;
-; CHECK-NEXT: sub.cc.s64 %rd105, %rd76, %rd84;
-; CHECK-NEXT: subc.cc.s64 %rd106, %rd73, %rd85;
-; CHECK-NEXT: add.cc.s64 %rd103, %rd103, -1;
-; CHECK-NEXT: addc.cc.s64 %rd104, %rd104, -1;
-; CHECK-NEXT: or.b64 %rd86, %rd103, %rd104;
-; CHECK-NEXT: setp.eq.b64 %p17, %rd86, 0;
+; CHECK-NEXT: shr.u64 %rd73, %rd107, 63;
+; CHECK-NEXT: shl.b64 %rd74, %rd108, 1;
+; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT: shl.b64 %rd76, %rd107, 1;
+; CHECK-NEXT: shr.u64 %rd77, %rd110, 63;
+; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT: shr.u64 %rd79, %rd109, 63;
+; CHECK-NEXT: shl.b64 %rd80, %rd110, 1;
+; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT: shl.b64 %rd82, %rd109, 1;
+; CHECK-NEXT: or.b64 %rd109, %rd103, %rd82;
+; CHECK-NEXT: or.b64 %rd110, %rd100, %rd81;
+; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT: shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT: and.b64 %rd103, %rd85, 1;
+; CHECK-NEXT: and.b64 %rd86, %rd85, %rd3;
+; CHECK-NEXT: and.b64 %rd87, %rd85, %rd4;
+; CHECK-NEXT: sub.cc.s64 %rd107, %rd78, %rd86;
+; CHECK-NEXT: subc.cc.s64 %rd108, %rd75, %rd87;
+; CHECK-NEXT: add.cc.s64 %rd105, %rd105, -1;
+; CHECK-NEXT: addc.cc.s64 %rd106, %rd106, -1;
+; CHECK-NEXT: or.b64 %rd88, %rd105, %rd106;
+; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0;
; CHECK-NEXT: @%p17 bra $L__BB1_4;
; CHECK-NEXT: bra.uni $L__BB1_2;
; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd87, %rd107, 63;
-; CHECK-NEXT: shl.b64 %rd88, %rd108, 1;
-; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT: shl.b64 %rd90, %rd107, 1;
-; CHECK-NEXT: or.b64 %rd109, %rd101, %rd90;
-; CHECK-NEXT: or.b64 %rd110, %rd98, %rd89;
+; CHECK-NEXT: shr.u64 %rd89, %rd109, 63;
+; CHECK-NEXT: shl.b64 %rd90, %rd110, 1;
+; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT: shl.b64 %rd92, %rd109, 1;
+; CHECK-NEXT: or.b64 %rd111, %rd103, %rd92;
+; CHECK-NEXT: or.b64 %rd112, %rd100, %rd91;
; CHECK-NEXT: $L__BB1_5: // %udiv-end
-; CHECK-NEXT: mul.hi.u64 %rd91, %rd3, %rd109;
-; CHECK-NEXT: mad.lo.s64 %rd92, %rd3, %rd110, %rd91;
-; CHECK-NEXT: mad.lo.s64 %rd93, %rd4, %rd109, %rd92;
-; CHECK-NEXT: mul.lo.s64 %rd94, %rd3, %rd109;
-; CHECK-NEXT: sub.cc.s64 %rd95, %rd41, %rd94;
-; CHECK-NEXT: subc.cc.s64 %rd96, %rd42, %rd93;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd95, %rd96};
+; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd111;
+; CHECK-NEXT: mad.lo.s64 %rd94, %rd3, %rd112, %rd93;
+; CHECK-NEXT: mad.lo.s64 %rd95, %rd4, %rd111, %rd94;
+; CHECK-NEXT: mul.lo.s64 %rd96, %rd3, %rd111;
+; CHECK-NEXT: sub.cc.s64 %rd97, %rd41, %rd96;
+; CHECK-NEXT: subc.cc.s64 %rd98, %rd42, %rd95;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd97, %rd98};
; CHECK-NEXT: ret;
%div = urem i128 %lhs, %rhs
ret i128 %div
@@ -307,9 +306,9 @@ define i128 @urem_i128_pow2k(i128 %lhs) {
define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: sdiv_i128(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<22>;
+; CHECK-NEXT: .reg .pred %p<20>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<121>;
+; CHECK-NEXT: .reg .b64 %rd<122>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0];
@@ -345,97 +344,96 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd63, %r4;
; CHECK-NEXT: add.s64 %rd64, %rd63, 64;
; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7;
-; CHECK-NEXT: mov.b64 %rd111, 0;
+; CHECK-NEXT: mov.b64 %rd112, 0;
; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65;
-; CHECK-NEXT: subc.cc.s64 %rd8, %rd111, 0;
-; CHECK-NEXT: setp.ne.b64 %p8, %rd8, 0;
-; CHECK-NEXT: and.pred %p10, %p8, %p8;
-; CHECK-NEXT: setp.eq.b64 %p11, %rd8, 0;
-; CHECK-NEXT: setp.gt.u64 %p12, %rd67, 127;
-; CHECK-NEXT: and.pred %p13, %p11, %p12;
-; CHECK-NEXT: or.pred %p14, %p13, %p10;
-; CHECK-NEXT: or.pred %p15, %p5, %p14;
-; CHECK-NEXT: xor.b64 %rd68, %rd67, 127;
-; CHECK-NEXT: or.b64 %rd69, %rd68, %rd8;
-; CHECK-NEXT: setp.eq.b64 %p16, %rd69, 0;
-; CHECK-NEXT: selp.b64 %rd120, 0, %rd2, %p15;
-; CHECK-NEXT: selp.b64 %rd119, 0, %rd1, %p15;
-; CHECK-NEXT: or.pred %p17, %p15, %p16;
-; CHECK-NEXT: @%p17 bra $L__BB4_5;
+; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0;
+; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127;
+; CHECK-NEXT: setp.eq.b64 %p9, %rd68, 0;
+; CHECK-NEXT: and.pred %p10, %p9, %p8;
+; CHECK-NEXT: setp.ne.b64 %p11, %rd68, 0;
+; CHECK-NEXT: or.pred %p12, %p10, %p11;
+; CHECK-NEXT: or.pred %p13, %p5, %p12;
+; CHECK-NEXT: xor.b64 %rd69, %rd67, 127;
+; CHECK-NEXT: or.b64 %rd70, %rd69, %rd68;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd70, 0;
+; CHECK-NEXT: selp.b64 %rd121, 0, %rd2, %p13;
+; CHECK-NEXT: selp.b64 %rd120, 0, %rd1, %p13;
+; CHECK-NEXT: or.pred %p15, %p13, %p14;
+; CHECK-NEXT: @%p15 bra $L__BB4_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd113, %rd67, 1;
-; CHECK-NEXT: addc.cc.s64 %rd114, %rd8, 0;
-; CHECK-NEXT: or.b64 %rd72, %rd113, %rd114;
-; CHECK-NEXT: setp.eq.b64 %p18, %rd72, 0;
+; CHECK-NEXT: add.cc.s64 %rd114, %rd67, 1;
+; CHECK-NEXT: addc.cc.s64 %rd115, %rd68, 0;
+; CHECK-NEXT: or.b64 %rd73, %rd114, %rd115;
+; CHECK-NEXT: setp.eq.b64 %p16, %rd73, 0;
; CHECK-NEXT: cvt.u32.u64 %r5, %rd67;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd73, %rd2, %r6;
+; CHECK-NEXT: shl.b64 %rd74, %rd2, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd74, %rd1, %r7;
-; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74;
+; CHECK-NEXT: shr.u64 %rd75, %rd1, %r7;
+; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd76, %rd1, %r8;
-; CHECK-NEXT: setp.gt.s32 %p19, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd118, %rd76, %rd75, %p19;
-; CHECK-NEXT: shl.b64 %rd117, %rd1, %r6;
-; CHECK-NEXT: mov.b64 %rd108, %rd111;
-; CHECK-NEXT: @%p18 bra $L__BB4_4;
+; CHECK-NEXT: shl.b64 %rd77, %rd1, %r8;
+; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p17;
+; CHECK-NEXT: shl.b64 %rd118, %rd1, %r6;
+; CHECK-NEXT: mov.b64 %rd109, %rd112;
+; CHECK-NEXT: @%p16 bra $L__BB4_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd113;
-; CHECK-NEXT: shr.u64 %rd79, %rd1, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd114;
+; CHECK-NEXT: shr.u64 %rd80, %rd1, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd80, %rd2, %r10;
-; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80;
+; CHECK-NEXT: shl.b64 %rd81, %rd2, %r10;
+; CHECK-NEXT: or.b64 %rd82, %rd80, %rd81;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd82, %rd2, %r11;
-; CHECK-NEXT: setp.gt.s32 %p20, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd115, %rd82, %rd81, %p20;
-; CHECK-NEXT: shr.u64 %rd116, %rd2, %r9;
+; CHECK-NEXT: shr.u64 %rd83, %rd2, %r11;
+; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p18;
+; CHECK-NEXT: shr.u64 %rd117, %rd2, %r9;
; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1;
; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1;
-; CHECK-NEXT: mov.b64 %rd108, 0;
-; CHECK-NEXT: mov.b64 %rd111, %rd108;
+; CHECK-NEXT: mov.b64 %rd109, 0;
+; CHECK-NEXT: mov.b64 %rd112, %rd109;
; CHECK-NEXT: $L__BB4_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd83, %rd115, 63;
-; CHECK-NEXT: shl.b64 %rd84, %rd116, 1;
-; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83;
-; CHECK-NEXT: shl.b64 %rd86, %rd115, 1;
-; CHECK-NEXT: shr.u64 %rd87, %rd118, 63;
-; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87;
-; CHECK-NEXT: shr.u64 %rd89, %rd117, 63;
-; CHECK-NEXT: shl.b64 %rd90, %rd118, 1;
-; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT: shl.b64 %rd92, %rd117, 1;
-; CHECK-NEXT: or.b64 %rd117, %rd111, %rd92;
-; CHECK-NEXT: or.b64 %rd118, %rd108, %rd91;
-; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88;
-; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85;
-; CHECK-NEXT: shr.s64 %rd95, %rd94, 63;
-; CHECK-NEXT: and.b64 %rd111, %rd95, 1;
-; CHECK-NEXT: and.b64 %rd96, %rd95, %rd3;
-; CHECK-NEXT: and.b64 %rd97, %rd95, %rd4;
-; CHECK-NEXT: sub.cc.s64 %rd115, %rd88, %rd96;
-; CHECK-NEXT: subc.cc.s64 %rd116, %rd85, %rd97;
-; CHECK-NEXT: add.cc.s64 %rd113, %rd113, -1;
-; CHECK-NEXT: addc.cc.s64 %rd114, %rd114, -1;
-; CHECK-NEXT: or.b64 %rd98, %rd113, %rd114;
-; CHECK-NEXT: setp.eq.b64 %p21, %rd98, 0;
-; CHECK-NEXT: @%p21 bra $L__BB4_4;
+; CHECK-NEXT: shr.u64 %rd84, %rd116, 63;
+; CHECK-NEXT: shl.b64 %rd85, %rd117, 1;
+; CHECK-NEXT: or.b64 %rd86, %rd85, %rd84;
+; CHECK-NEXT: shl.b64 %rd87, %rd116, 1;
+; CHECK-NEXT: shr.u64 %rd88, %rd119, 63;
+; CHECK-NEXT: or.b64 %rd89, %rd87, %rd88;
+; CHECK-NEXT: shr.u64 %rd90, %rd118, 63;
+; CHECK-NEXT: shl.b64 %rd91, %rd119, 1;
+; CHECK-NEXT: or.b64 %rd92, %rd91, %rd90;
+; CHECK-NEXT: shl.b64 %rd93, %rd118, 1;
+; CHECK-NEXT: or.b64 %rd118, %rd112, %rd93;
+; CHECK-NEXT: or.b64 %rd119, %rd109, %rd92;
+; CHECK-NEXT: sub.cc.s64 %rd94, %rd35, %rd89;
+; CHECK-NEXT: subc.cc.s64 %rd95, %rd36, %rd86;
+; CHECK-NEXT: shr.s64 %rd96, %rd95, 63;
+; CHECK-NEXT: and.b64 %rd112, %rd96, 1;
+; CHECK-NEXT: and.b64 %rd97, %rd96, %rd3;
+; CHECK-NEXT: and.b64 %rd98, %rd96, %rd4;
+; CHECK-NEXT: sub.cc.s64 %rd116, %rd89, %rd97;
+; CHECK-NEXT: subc.cc.s64 %rd117, %rd86, %rd98;
+; CHECK-NEXT: add.cc.s64 %rd114, %rd114, -1;
+; CHECK-NEXT: addc.cc.s64 %rd115, %rd115, -1;
+; CHECK-NEXT: or.b64 %rd99, %rd114, %rd115;
+; CHECK-NEXT: setp.eq.b64 %p19, %rd99, 0;
+; CHECK-NEXT: @%p19 bra $L__BB4_4;
; CHECK-NEXT: bra.uni $L__BB4_2;
; CHECK-NEXT: $L__BB4_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd99, %rd117, 63;
-; CHECK-NEXT: shl.b64 %rd100, %rd118, 1;
-; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99;
-; CHECK-NEXT: shl.b64 %rd102, %rd117, 1;
-; CHECK-NEXT: or.b64 %rd119, %rd111, %rd102;
-; CHECK-NEXT: or.b64 %rd120, %rd108, %rd101;
+; CHECK-NEXT: shr.u64 %rd100, %rd118, 63;
+; CHECK-NEXT: shl.b64 %rd101, %rd119, 1;
+; CHECK-NEXT: or.b64 %rd102, %rd101, %rd100;
+; CHECK-NEXT: shl.b64 %rd103, %rd118, 1;
+; CHECK-NEXT: or.b64 %rd120, %rd112, %rd103;
+; CHECK-NEXT: or.b64 %rd121, %rd109, %rd102;
; CHECK-NEXT: $L__BB4_5: // %udiv-end
-; CHECK-NEXT: xor.b64 %rd103, %rd119, %rd5;
; CHECK-NEXT: xor.b64 %rd104, %rd120, %rd5;
-; CHECK-NEXT: sub.cc.s64 %rd105, %rd103, %rd5;
-; CHECK-NEXT: subc.cc.s64 %rd106, %rd104, %rd5;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106};
+; CHECK-NEXT: xor.b64 %rd105, %rd121, %rd5;
+; CHECK-NEXT: sub.cc.s64 %rd106, %rd104, %rd5;
+; CHECK-NEXT: subc.cc.s64 %rd107, %rd105, %rd5;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd106, %rd107};
; CHECK-NEXT: ret;
%div = sdiv i128 %lhs, %rhs
ret i128 %div
@@ -446,7 +444,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<18>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<105>;
+; CHECK-NEXT: .reg .b64 %rd<107>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0];
@@ -470,92 +468,92 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd52, %r4;
; CHECK-NEXT: add.s64 %rd53, %rd52, 64;
; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT: mov.b64 %rd95, 0;
-; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT: subc.cc.s64 %rd6, %rd95, 0;
-; CHECK-NEXT: setp.gt.u64 %p6, %rd5, 127;
-; CHECK-NEXT: setp.eq.b64 %p7, %rd6, 0;
+; CHECK-NEXT: mov.b64 %rd97, 0;
+; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT: subc.cc.s64 %rd57, %rd97, 0;
+; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0;
; CHECK-NEXT: and.pred %p8, %p7, %p6;
-; CHECK-NEXT: setp.ne.b64 %p9, %rd6, 0;
+; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0;
; CHECK-NEXT: or.pred %p10, %p8, %p9;
; CHECK-NEXT: or.pred %p11, %p3, %p10;
-; CHECK-NEXT: xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT: setp.eq.b64 %p12, %rd57, 0;
-; CHECK-NEXT: selp.b64 %rd104, 0, %rd42, %p11;
-; CHECK-NEXT: selp.b64 %rd103, 0, %rd41, %p11;
+; CHECK-NEXT: xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0;
+; CHECK-NEXT: selp.b64 %rd106, 0, %rd42, %p11;
+; CHECK-NEXT: selp.b64 %rd105, 0, %rd41, %p11;
; CHECK-NEXT: or.pred %p13, %p11, %p12;
; CHECK-NEXT: @%p13 bra $L__BB5_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd97, %rd5, 1;
-; CHECK-NEXT: addc.cc.s64 %rd98, %rd6, 0;
-; CHECK-NEXT: or.b64 %rd60, %rd97, %rd98;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd60, 0;
-; CHECK-NEXT: cvt.u32.u64 %r5, %rd5;
+; CHECK-NEXT: add.cc.s64 %rd99, %rd56, 1;
+; CHECK-NEXT: addc.cc.s64 %rd100, %rd57, 0;
+; CHECK-NEXT: or.b64 %rd62, %rd99, %rd100;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd56;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd61, %rd42, %r6;
+; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd62, %rd41, %r7;
-; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7;
+; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd64, %rd41, %r8;
+; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8;
; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd102, %rd64, %rd63, %p15;
-; CHECK-NEXT: shl.b64 %rd101, %rd41, %r6;
-; CHECK-NEXT: mov.b64 %rd92, %rd95;
+; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p15;
+; CHECK-NEXT: shl.b64 %rd103, %rd41, %r6;
+; CHECK-NEXT: mov.b64 %rd94, %rd97;
; CHECK-NEXT: @%p14 bra $L__BB5_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd97;
-; CHECK-NEXT: shr.u64 %rd67, %rd41, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd99;
+; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd68, %rd42, %r10;
-; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10;
+; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd70, %rd42, %r11;
+; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11;
; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd99, %rd70, %rd69, %p16;
-; CHECK-NEXT: shr.u64 %rd100, %rd42, %r9;
+; CHECK-NEXT: selp.b64 %rd101, %rd72, %rd71, %p16;
+; CHECK-NEXT: shr.u64 %rd102, %rd42, %r9;
; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1;
; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1;
-; CHECK-NEXT: mov.b64 %rd92, 0;
-; CHECK-NEXT: mov.b64 %rd95, %rd92;
+; CHECK-NEXT: mov.b64 %rd94, 0;
+; CHECK-NEXT: mov.b64 %rd97, %rd94;
; CHECK-NEXT: $L__BB5_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd71, %rd99, 63;
-; CHECK-NEXT: shl.b64 %rd72, %rd100, 1;
-; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT: shl.b64 %rd74, %rd99, 1;
-; CHECK-NEXT: shr.u64 %rd75, %rd102, 63;
-; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT: shr.u64 %rd77, %rd101, 63;
-; CHECK-NEXT: shl.b64 %rd78, %rd102, 1;
-; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT: shl.b64 %rd80, %rd101, 1;
-; CHECK-NEXT: or.b64 %rd101, %rd95, %rd80;
-; CHECK-NEXT: or.b64 %rd102, %rd92, %rd79;
-; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT: shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT: and.b64 %rd95, %rd83, 1;
-; CHECK-NEXT: and.b64 %rd84, %rd83, %rd43;
-; CHECK-NEXT: and.b64 %rd85, %rd83, %rd44;
-; CHECK-NEXT: sub.cc.s64 %rd99, %rd76, %rd84;
-; CHECK-NEXT: subc.cc.s64 %rd100, %rd73, %rd85;
-; CHECK-NEXT: add.cc.s64 %rd97, %rd97, -1;
-; CHECK-NEXT: addc.cc.s64 %rd98, %rd98, -1;
-; CHECK-NEXT: or.b64 %rd86, %rd97, %rd98;
-; CHECK-NEXT: setp.eq.b64 %p17, %rd86, 0;
+; CHECK-NEXT: shr.u64 %rd73, %rd101, 63;
+; CHECK-NEXT: shl.b64 %rd74, %rd102, 1;
+; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT: shl.b64 %rd76, %rd101, 1;
+; CHECK-NEXT: shr.u64 %rd77, %rd104, 63;
+; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT: shr.u64 %rd79, %rd103, 63;
+; CHECK-NEXT: shl.b64 %rd80, %rd104, 1;
+; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT: shl.b64 %rd82, %rd103, 1;
+; CHECK-NEXT: or.b64 %rd103, %rd97, %rd82;
+; CHECK-NEXT: or.b64 %rd104, %rd94, %rd81;
+; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT: shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT: and.b64 %rd97, %rd85, 1;
+; CHECK-NEXT: and.b64 %rd86, %rd85, %rd43;
+; CHECK-NEXT: and.b64 %rd87, %rd85, %rd44;
+; CHECK-NEXT: sub.cc.s64 %rd101, %rd78, %rd86;
+; CHECK-NEXT: subc.cc.s64 %rd102, %rd75, %rd87;
+; CHECK-NEXT: add.cc.s64 %rd99, %rd99, -1;
+; CHECK-NEXT: addc.cc.s64 %rd100, %rd100, -1;
+; CHECK-NEXT: or.b64 %rd88, %rd99, %rd100;
+; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0;
; CHECK-NEXT: @%p17 bra $L__BB5_4;
; CHECK-NEXT: bra.uni $L__BB5_2;
; CHECK-NEXT: $L__BB5_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd87, %rd101, 63;
-; CHECK-NEXT: shl.b64 %rd88, %rd102, 1;
-; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT: shl.b64 %rd90, %rd101, 1;
-; CHECK-NEXT: or.b64 %rd103, %rd95, %rd90;
-; CHECK-NEXT: or.b64 %rd104, %rd92, %rd89;
+; CHECK-NEXT: shr.u64 %rd89, %rd103, 63;
+; CHECK-NEXT: shl.b64 %rd90, %rd104, 1;
+; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT: shl.b64 %rd92, %rd103, 1;
+; CHECK-NEXT: or.b64 %rd105, %rd97, %rd92;
+; CHECK-NEXT: or.b64 %rd106, %rd94, %rd91;
; CHECK-NEXT: $L__BB5_5: // %udiv-end
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd103, %rd104};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106};
; CHECK-NEXT: ret;
%div = udiv i128 %lhs, %rhs
ret i128 %div
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index 821cfd00dcd07..b540948b20f75 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -764,8 +764,13 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
;
; CHECK-PWR7-LABEL: sub_absv_8_ext:
; CHECK-PWR7: # %bb.0: # %entry
-; CHECK-PWR7-NEXT: stdu r1, -448(r1)
-; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 448
+; CHECK-PWR7-NEXT: stdu r1, -512(r1)
+; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 512
+; CHECK-PWR7-NEXT: .cfi_offset r14, -144
+; CHECK-PWR7-NEXT: .cfi_offset r15, -136
+; CHECK-PWR7-NEXT: .cfi_offset r16, -128
+; CHECK-PWR7-NEXT: .cfi_offset r17, -120
+; CHECK-PWR7-NEXT: .cfi_offset r18, -112
; CHECK-PWR7-NEXT: .cfi_offset r19, -104
; CHECK-PWR7-NEXT: .cfi_offset r20, -96
; CHECK-PWR7-NEXT: .cfi_offset r21, -88
@@ -778,258 +783,244 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR7-NEXT: .cfi_offset r28, -32
; CHECK-PWR7-NEXT: .cfi_offset r29, -24
; CHECK-PWR7-NEXT: .cfi_offset r30, -16
-; CHECK-PWR7-NEXT: addi r3, r1, 304
-; CHECK-PWR7-NEXT: std r19, 344(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r20, 352(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r21, 360(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r22, 368(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r23, 376(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r24, 384(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r25, 392(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r26, 400(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r27, 408(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r28, 416(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r29, 424(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: std r30, 432(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT: .cfi_offset r31, -8
+; CHECK-PWR7-NEXT: .cfi_offset r2, -152
; CHECK-PWR7-NEXT: addi r3, r1, 320
-; CHECK-PWR7-NEXT: lbz r7, 304(r1)
-; CHECK-PWR7-NEXT: stxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: lbz r8, 320(r1)
-; CHECK-PWR7-NEXT: lbz r9, 305(r1)
-; CHECK-PWR7-NEXT: lbz r10, 321(r1)
-; CHECK-PWR7-NEXT: lbz r26, 325(r1)
-; CHECK-PWR7-NEXT: clrlwi r7, r7, 24
-; CHECK-PWR7-NEXT: clrlwi r8, r8, 24
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r10, r10, 24
-; CHECK-PWR7-NEXT: lbz r11, 306(r1)
-; CHECK-PWR7-NEXT: lbz r12, 322(r1)
-; CHECK-PWR7-NEXT: lbz r23, 314(r1)
-; CHECK-PWR7-NEXT: clrlwi r22, r26, 24
-; CHECK-PWR7-NEXT: lbz r26, 330(r1)
-; CHECK-PWR7-NEXT: sub r8, r7, r8
-; CHECK-PWR7-NEXT: lbz r7, 315(r1)
-; CHECK-PWR7-NEXT: sub r20, r9, r10
-; CHECK-PWR7-NEXT: lbz r9, 331(r1)
-; CHECK-PWR7-NEXT: lbz r0, 307(r1)
-; CHECK-PWR7-NEXT: lbz r30, 323(r1)
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: clrlwi r12, r12, 24
-; CHECK-PWR7-NEXT: clrlwi r23, r23, 24
-; CHECK-PWR7-NEXT: clrlwi r21, r26, 24
-; CHECK-PWR7-NEXT: clrlwi r7, r7, 24
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r0, r0, 24
-; CHECK-PWR7-NEXT: clrlwi r30, r30, 24
-; CHECK-PWR7-NEXT: lbz r29, 308(r1)
-; CHECK-PWR7-NEXT: lbz r28, 324(r1)
-; CHECK-PWR7-NEXT: lbz r27, 309(r1)
-; CHECK-PWR7-NEXT: lbz r25, 310(r1)
-; CHECK-PWR7-NEXT: lbz r24, 326(r1)
-; CHECK-PWR7-NEXT: sub r19, r11, r12
-; CHECK-PWR7-NEXT: sub r11, r23, r21
-; CHECK-PWR7-NEXT: sub r9, r7, r9
-; CHECK-PWR7-NEXT: sub r26, r0, r30
-; CHECK-PWR7-NEXT: srawi r12, r11, 31
-; CHECK-PWR7-NEXT: srawi r0, r9, 31
-; CHECK-PWR7-NEXT: lbz r3, 312(r1)
-; CHECK-PWR7-NEXT: clrlwi r29, r29, 24
-; CHECK-PWR7-NEXT: clrlwi r28, r28, 24
-; CHECK-PWR7-NEXT: clrlwi r27, r27, 24
-; CHECK-PWR7-NEXT: clrlwi r25, r25, 24
-; CHECK-PWR7-NEXT: clrlwi r24, r24, 24
-; CHECK-PWR7-NEXT: xor r11, r11, r12
-; CHECK-PWR7-NEXT: xor r9, r9, r0
-; CHECK-PWR7-NEXT: sub r28, r29, r28
-; CHECK-PWR7-NEXT: sub r30, r27, r22
-; CHECK-PWR7-NEXT: sub r29, r25, r24
-; CHECK-PWR7-NEXT: sub r27, r11, r12
-; CHECK-PWR7-NEXT: sub r24, r9, r0
-; CHECK-PWR7-NEXT: lbz r9, 316(r1)
-; CHECK-PWR7-NEXT: lbz r11, 332(r1)
-; CHECK-PWR7-NEXT: lbz r4, 328(r1)
-; CHECK-PWR7-NEXT: lbz r5, 311(r1)
-; CHECK-PWR7-NEXT: lbz r6, 327(r1)
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: clrlwi r3, r3, 24
-; CHECK-PWR7-NEXT: clrlwi r4, r4, 24
-; CHECK-PWR7-NEXT: clrlwi r5, r5, 24
-; CHECK-PWR7-NEXT: clrlwi r6, r6, 24
-; CHECK-PWR7-NEXT: sub r3, r3, r4
+; CHECK-PWR7-NEXT: std r14, 368(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r15, 376(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r16, 384(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r17, 392(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r18, 400(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r19, 408(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r20, 416(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r21, 424(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r22, 432(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r23, 440(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r24, 448(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r25, 456(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r26, 464(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r27, 472(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r28, 480(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r29, 488(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r30, 496(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r31, 504(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: std r2, 360(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT: lbz r3, 320(r1)
+; CHECK-PWR7-NEXT: addi r4, r1, 336
+; CHECK-PWR7-NEXT: stw r3, 60(r1) # 4-byte Folded Spill
+; CHECK-PWR7-NEXT: stxvw4x v3, 0, r4
+; CHECK-PWR7-NEXT: lbz r15, 334(r1)
+; CHECK-PWR7-NEXT: lbz r14, 350(r1)
+; CHECK-PWR7-NEXT: lbz r31, 335(r1)
+; CHECK-PWR7-NEXT: lbz r2, 351(r1)
+; CHECK-PWR7-NEXT: sub r15, r15, r14
+; CHECK-PWR7-NEXT: sub r14, r31, r2
+; CHECK-PWR7-NEXT: srawi r2, r14, 31
+; CHECK-PWR7-NEXT: xor r14, r14, r2
+; CHECK-PWR7-NEXT: lbz r3, 333(r1)
+; CHECK-PWR7-NEXT: lbz r19, 331(r1)
+; CHECK-PWR7-NEXT: lbz r18, 347(r1)
+; CHECK-PWR7-NEXT: sub r19, r19, r18
+; CHECK-PWR7-NEXT: lbz r17, 332(r1)
+; CHECK-PWR7-NEXT: lbz r16, 348(r1)
+; CHECK-PWR7-NEXT: sub r17, r17, r16
+; CHECK-PWR7-NEXT: lbz r23, 329(r1)
+; CHECK-PWR7-NEXT: sub r14, r14, r2
+; CHECK-PWR7-NEXT: lbz r2, 349(r1)
+; CHECK-PWR7-NEXT: lbz r22, 345(r1)
+; CHECK-PWR7-NEXT: lbz r4, 336(r1)
+; CHECK-PWR7-NEXT: lbz r5, 321(r1)
+; CHECK-PWR7-NEXT: lbz r6, 337(r1)
+; CHECK-PWR7-NEXT: lbz r7, 322(r1)
+; CHECK-PWR7-NEXT: lbz r8, 338(r1)
+; CHECK-PWR7-NEXT: lbz r9, 323(r1)
+; CHECK-PWR7-NEXT: lbz r10, 339(r1)
+; CHECK-PWR7-NEXT: lbz r11, 324(r1)
+; CHECK-PWR7-NEXT: lbz r12, 340(r1)
+; CHECK-PWR7-NEXT: lbz r0, 325(r1)
+; CHECK-PWR7-NEXT: lbz r30, 341(r1)
+; CHECK-PWR7-NEXT: lbz r29, 326(r1)
+; CHECK-PWR7-NEXT: lbz r28, 342(r1)
+; CHECK-PWR7-NEXT: lbz r27, 327(r1)
+; CHECK-PWR7-NEXT: lbz r26, 343(r1)
+; CHECK-PWR7-NEXT: sub r3, r3, r2
+; CHECK-PWR7-NEXT: lbz r25, 328(r1)
+; CHECK-PWR7-NEXT: lbz r24, 344(r1)
+; CHECK-PWR7-NEXT: lbz r21, 330(r1)
+; CHECK-PWR7-NEXT: lbz r20, 346(r1)
; CHECK-PWR7-NEXT: sub r5, r5, r6
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: srawi r4, r3, 31
+; CHECK-PWR7-NEXT: srawi r18, r3, 31
+; CHECK-PWR7-NEXT: sub r7, r7, r8
+; CHECK-PWR7-NEXT: sub r9, r9, r10
+; CHECK-PWR7-NEXT: sub r11, r11, r12
+; CHECK-PWR7-NEXT: sub r0, r0, r30
+; CHECK-PWR7-NEXT: sub r29, r29, r28
+; CHECK-PWR7-NEXT: sub r27, r27, r26
+; CHECK-PWR7-NEXT: sub r25, r25, r24
+; CHECK-PWR7-NEXT: srawi r31, r15, 31
+; CHECK-PWR7-NEXT: ld r2, 360(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: xor r3, r3, r18
; CHECK-PWR7-NEXT: srawi r6, r5, 31
-; CHECK-PWR7-NEXT: xor r3, r3, r4
-; CHECK-PWR7-NEXT: sldi r27, r27, 56
-; CHECK-PWR7-NEXT: xor r5, r5, r6
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: sub r3, r3, r4
-; CHECK-PWR7-NEXT: sldi r24, r24, 56
+; CHECK-PWR7-NEXT: srawi r8, r7, 31
+; CHECK-PWR7-NEXT: srawi r10, r9, 31
+; CHECK-PWR7-NEXT: srawi r12, r11, 31
+; CHECK-PWR7-NEXT: srawi r30, r0, 31
+; CHECK-PWR7-NEXT: sub r3, r3, r18
+; CHECK-PWR7-NEXT: srawi r18, r19, 31
+; CHECK-PWR7-NEXT: srawi r28, r29, 31
+; CHECK-PWR7-NEXT: ld r16, 384(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: srawi r11, r9, 31
-; CHECK-PWR7-NEXT: std r27, 208(r1)
-; CHECK-PWR7-NEXT: sub r4, r5, r6
-; CHECK-PWR7-NEXT: std r27, 216(r1)
-; CHECK-PWR7-NEXT: srawi r27, r29, 31
-; CHECK-PWR7-NEXT: lbz r10, 313(r1)
-; CHECK-PWR7-NEXT: xor r9, r9, r11
-; CHECK-PWR7-NEXT: std r24, 224(r1)
-; CHECK-PWR7-NEXT: lbz r22, 329(r1)
-; CHECK-PWR7-NEXT: std r24, 232(r1)
-; CHECK-PWR7-NEXT: srawi r24, r30, 31
-; CHECK-PWR7-NEXT: ld r21, 360(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sub r23, r9, r11
-; CHECK-PWR7-NEXT: lbz r9, 317(r1)
-; CHECK-PWR7-NEXT: lbz r11, 333(r1)
-; CHECK-PWR7-NEXT: xor r29, r29, r27
-; CHECK-PWR7-NEXT: std r3, 176(r1)
-; CHECK-PWR7-NEXT: std r3, 184(r1)
-; CHECK-PWR7-NEXT: sldi r3, r4, 56
-; CHECK-PWR7-NEXT: sldi r23, r23, 56
-; CHECK-PWR7-NEXT: xor r30, r30, r24
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: sub r4, r30, r24
-; CHECK-PWR7-NEXT: ld r30, 432(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: std r3, 160(r1)
-; CHECK-PWR7-NEXT: std r3, 168(r1)
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: sub r3, r29, r27
-; CHECK-PWR7-NEXT: std r23, 240(r1)
-; CHECK-PWR7-NEXT: ld r29, 424(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r11, r9, 31
-; CHECK-PWR7-NEXT: std r23, 248(r1)
-; CHECK-PWR7-NEXT: ld r27, 408(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r23, r28, 31
+; CHECK-PWR7-NEXT: srawi r26, r27, 31
+; CHECK-PWR7-NEXT: srawi r24, r25, 31
+; CHECK-PWR7-NEXT: xor r19, r19, r18
+; CHECK-PWR7-NEXT: xor r15, r15, r31
+; CHECK-PWR7-NEXT: xor r5, r5, r6
+; CHECK-PWR7-NEXT: std r3, 272(r1)
+; CHECK-PWR7-NEXT: std r3, 280(r1)
+; CHECK-PWR7-NEXT: srawi r3, r17, 31
+; CHECK-PWR7-NEXT: sub r19, r19, r18
+; CHECK-PWR7-NEXT: xor r7, r7, r8
+; CHECK-PWR7-NEXT: sub r15, r15, r31
+; CHECK-PWR7-NEXT: xor r17, r17, r3
+; CHECK-PWR7-NEXT: xor r9, r9, r10
+; CHECK-PWR7-NEXT: xor r11, r11, r12
+; CHECK-PWR7-NEXT: xor r0, r0, r30
+; CHECK-PWR7-NEXT: xor r29, r29, r28
+; CHECK-PWR7-NEXT: xor r27, r27, r26
+; CHECK-PWR7-NEXT: sub r3, r17, r3
+; CHECK-PWR7-NEXT: xor r25, r25, r24
+; CHECK-PWR7-NEXT: sub r25, r25, r24
+; CHECK-PWR7-NEXT: sub r27, r27, r26
+; CHECK-PWR7-NEXT: sub r29, r29, r28
; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: xor r28, r28, r23
-; CHECK-PWR7-NEXT: xor r9, r9, r11
-; CHECK-PWR7-NEXT: std r3, 144(r1)
-; CHECK-PWR7-NEXT: ld r24, 384(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: std r3, 152(r1)
-; CHECK-PWR7-NEXT: sldi r3, r4, 56
-; CHECK-PWR7-NEXT: sub r25, r9, r11
-; CHECK-PWR7-NEXT: lbz r9, 318(r1)
-; CHECK-PWR7-NEXT: lbz r11, 334(r1)
-; CHECK-PWR7-NEXT: std r3, 128(r1)
+; CHECK-PWR7-NEXT: sub r0, r0, r30
+; CHECK-PWR7-NEXT: sub r11, r11, r12
+; CHECK-PWR7-NEXT: sub r9, r9, r10
+; CHECK-PWR7-NEXT: sub r7, r7, r8
+; CHECK-PWR7-NEXT: sub r5, r5, r6
+; CHECK-PWR7-NEXT: sldi r14, r14, 56
+; CHECK-PWR7-NEXT: sldi r15, r15, 56
+; CHECK-PWR7-NEXT: ld r31, 504(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r3, 256(r1)
+; CHECK-PWR7-NEXT: std r3, 264(r1)
+; CHECK-PWR7-NEXT: sldi r3, r19, 56
; CHECK-PWR7-NEXT: sldi r25, r25, 56
-; CHECK-PWR7-NEXT: std r3, 136(r1)
-; CHECK-PWR7-NEXT: sub r3, r28, r23
+; CHECK-PWR7-NEXT: sldi r27, r27, 56
+; CHECK-PWR7-NEXT: std r3, 240(r1)
+; CHECK-PWR7-NEXT: std r3, 248(r1)
+; CHECK-PWR7-NEXT: sub r3, r23, r22
+; CHECK-PWR7-NEXT: srawi r23, r3, 31
+; CHECK-PWR7-NEXT: sub r22, r21, r20
+; CHECK-PWR7-NEXT: srawi r21, r22, 31
+; CHECK-PWR7-NEXT: sldi r29, r29, 56
+; CHECK-PWR7-NEXT: sldi r0, r0, 56
+; CHECK-PWR7-NEXT: sldi r11, r11, 56
+; CHECK-PWR7-NEXT: xor r3, r3, r23
+; CHECK-PWR7-NEXT: xor r22, r22, r21
+; CHECK-PWR7-NEXT: sldi r9, r9, 56
+; CHECK-PWR7-NEXT: sldi r7, r7, 56
+; CHECK-PWR7-NEXT: sldi r5, r5, 56
+; CHECK-PWR7-NEXT: ld r30, 496(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r28, 480(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: sub r3, r3, r23
+; CHECK-PWR7-NEXT: sub r22, r22, r21
+; CHECK-PWR7-NEXT: std r14, 304(r1)
+; CHECK-PWR7-NEXT: ld r26, 464(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: std r3, 112(r1)
-; CHECK-PWR7-NEXT: ld r28, 416(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: clrlwi r10, r10, 24
-; CHECK-PWR7-NEXT: std r25, 256(r1)
-; CHECK-PWR7-NEXT: std r25, 264(r1)
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: srawi r25, r26, 31
-; CHECK-PWR7-NEXT: xor r26, r26, r25
-; CHECK-PWR7-NEXT: ld r23, 376(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: srawi r11, r9, 31
-; CHECK-PWR7-NEXT: std r3, 120(r1)
-; CHECK-PWR7-NEXT: sub r4, r26, r25
-; CHECK-PWR7-NEXT: clrlwi r22, r22, 24
-; CHECK-PWR7-NEXT: srawi r7, r8, 31
-; CHECK-PWR7-NEXT: sub r10, r10, r22
-; CHECK-PWR7-NEXT: ld r26, 400(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: xor r9, r9, r11
-; CHECK-PWR7-NEXT: sldi r3, r4, 56
-; CHECK-PWR7-NEXT: srawi r22, r10, 31
-; CHECK-PWR7-NEXT: xor r8, r8, r7
-; CHECK-PWR7-NEXT: xor r10, r10, r22
-; CHECK-PWR7-NEXT: sub r10, r10, r22
-; CHECK-PWR7-NEXT: ld r25, 392(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sub r12, r9, r11
-; CHECK-PWR7-NEXT: lbz r9, 319(r1)
-; CHECK-PWR7-NEXT: lbz r11, 335(r1)
-; CHECK-PWR7-NEXT: std r3, 96(r1)
-; CHECK-PWR7-NEXT: sldi r12, r12, 56
-; CHECK-PWR7-NEXT: std r3, 104(r1)
-; CHECK-PWR7-NEXT: ld r22, 368(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sldi r10, r10, 56
-; CHECK-PWR7-NEXT: std r10, 192(r1)
-; CHECK-PWR7-NEXT: clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: std r12, 272(r1)
-; CHECK-PWR7-NEXT: std r12, 280(r1)
-; CHECK-PWR7-NEXT: srawi r12, r19, 31
-; CHECK-PWR7-NEXT: xor r0, r19, r12
-; CHECK-PWR7-NEXT: ld r19, 344(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sub r3, r0, r12
-; CHECK-PWR7-NEXT: srawi r11, r9, 31
-; CHECK-PWR7-NEXT: std r10, 200(r1)
-; CHECK-PWR7-NEXT: xor r9, r9, r11
+; CHECK-PWR7-NEXT: sldi r22, r22, 56
+; CHECK-PWR7-NEXT: ld r24, 448(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r23, 440(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r14, 312(r1)
+; CHECK-PWR7-NEXT: std r15, 288(r1)
+; CHECK-PWR7-NEXT: std r3, 208(r1)
+; CHECK-PWR7-NEXT: std r3, 216(r1)
+; CHECK-PWR7-NEXT: lwz r3, 60(r1) # 4-byte Folded Reload
+; CHECK-PWR7-NEXT: std r15, 296(r1)
+; CHECK-PWR7-NEXT: ld r21, 424(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r20, 416(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r22, 224(r1)
+; CHECK-PWR7-NEXT: std r22, 232(r1)
+; CHECK-PWR7-NEXT: sub r4, r3, r4
+; CHECK-PWR7-NEXT: std r25, 192(r1)
+; CHECK-PWR7-NEXT: ld r22, 432(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r19, 408(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: srawi r3, r4, 31
+; CHECK-PWR7-NEXT: std r25, 200(r1)
+; CHECK-PWR7-NEXT: ld r25, 456(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r27, 176(r1)
+; CHECK-PWR7-NEXT: std r27, 184(r1)
+; CHECK-PWR7-NEXT: xor r4, r4, r3
+; CHECK-PWR7-NEXT: std r29, 160(r1)
+; CHECK-PWR7-NEXT: ld r27, 472(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r29, 168(r1)
+; CHECK-PWR7-NEXT: std r0, 144(r1)
+; CHECK-PWR7-NEXT: sub r3, r4, r3
+; CHECK-PWR7-NEXT: std r0, 152(r1)
+; CHECK-PWR7-NEXT: ld r29, 488(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: ld r18, 400(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: sub r9, r9, r11
-; CHECK-PWR7-NEXT: std r3, 80(r1)
-; CHECK-PWR7-NEXT: std r3, 88(r1)
-; CHECK-PWR7-NEXT: sldi r9, r9, 56
-; CHECK-PWR7-NEXT: std r9, 288(r1)
-; CHECK-PWR7-NEXT: std r9, 296(r1)
-; CHECK-PWR7-NEXT: srawi r9, r20, 31
-; CHECK-PWR7-NEXT: xor r11, r20, r9
-; CHECK-PWR7-NEXT: ld r20, 352(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT: sub r4, r11, r9
-; CHECK-PWR7-NEXT: sldi r3, r4, 56
+; CHECK-PWR7-NEXT: std r11, 128(r1)
+; CHECK-PWR7-NEXT: ld r17, 392(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r11, 136(r1)
+; CHECK-PWR7-NEXT: std r9, 112(r1)
; CHECK-PWR7-NEXT: std r3, 64(r1)
; CHECK-PWR7-NEXT: std r3, 72(r1)
-; CHECK-PWR7-NEXT: sub r3, r8, r7
-; CHECK-PWR7-NEXT: sldi r3, r3, 56
-; CHECK-PWR7-NEXT: std r3, 48(r1)
-; CHECK-PWR7-NEXT: std r3, 56(r1)
-; CHECK-PWR7-NEXT: addi r3, r1, 288
+; CHECK-PWR7-NEXT: addi r3, r1, 304
+; CHECK-PWR7-NEXT: std r9, 120(r1)
+; CHECK-PWR7-NEXT: ld r15, 376(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT: std r7, 96(r1)
+; CHECK-PWR7-NEXT: std r7, 104(r1)
+; CHECK-PWR7-NEXT: std r5, 80(r1)
+; CHECK-PWR7-NEXT: std r5, 88(r1)
; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 272
+; CHECK-PWR7-NEXT: addi r3, r1, 288
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 256
+; CHECK-PWR7-NEXT: addi r3, r1, 272
+; CHECK-PWR7-NEXT: ld r14, 368(r1) # 8-byte Folded Reload
; CHECK-PWR7-NEXT: vmrghb v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 240
+; CHECK-PWR7-NEXT: addi r3, r1, 256
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 224
+; CHECK-PWR7-NEXT: addi r3, r1, 240
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: vmrghh v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 208
+; CHECK-PWR7-NEXT: addi r3, r1, 224
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 192
+; CHECK-PWR7-NEXT: addi r3, r1, 208
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 176
+; CHECK-PWR7-NEXT: addi r3, r1, 192
; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 160
+; CHECK-PWR7-NEXT: addi r3, r1, 176
; CHECK-PWR7-NEXT: vmrghb v4, v5, v4
; CHECK-PWR7-NEXT: vmrghh v3, v4, v3
; CHECK-PWR7-NEXT: xxmrghw vs0, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 144
+; CHECK-PWR7-NEXT: addi r3, r1, 160
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 128
+; CHECK-PWR7-NEXT: addi r3, r1, 144
; CHECK-PWR7-NEXT: vmrghb v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 112
+; CHECK-PWR7-NEXT: addi r3, r1, 128
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 96
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
+; CHECK-PWR7-NEXT: addi r3, r1, 112
; CHECK-PWR7-NEXT: vmrghh v2, v3, v2
; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 80
+; CHECK-PWR7-NEXT: addi r3, r1, 96
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 64
+; CHECK-PWR7-NEXT: addi r3, r1, 80
; CHECK-PWR7-NEXT: vmrghb v3, v4, v3
; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT: addi r3, r1, 48
+; CHECK-PWR7-NEXT: addi r3, r1, 64
; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3
; CHECK-PWR7-NEXT: vmrghb v4, v5, v4
; CHECK-PWR7-NEXT: vmrghh v3, v4, v3
; CHECK-PWR7-NEXT: xxmrghw vs1, v3, v2
; CHECK-PWR7-NEXT: xxmrghd v2, vs1, vs0
-; CHECK-PWR7-NEXT: addi r1, r1, 448
+; CHECK-PWR7-NEXT: addi r1, r1, 512
; CHECK-PWR7-NEXT: blr
entry:
%vecext = extractelement <16 x i8> %a, i32 0
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 246e6a614d6aa..117e3e4aac45d 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) {
; RV32IF-NEXT: mv a1, a0
; RV32IF-NEXT: addi a0, sp, 8
; RV32IF-NEXT: call __fixdfti
-; RV32IF-NEXT: lw a0, 8(sp)
-; RV32IF-NEXT: lw a1, 12(sp)
-; RV32IF-NEXT: lw a2, 20(sp)
+; RV32IF-NEXT: lw a0, 20(sp)
+; RV32IF-NEXT: lw a1, 8(sp)
+; RV32IF-NEXT: lw a2, 12(sp)
; RV32IF-NEXT: lw a3, 16(sp)
-; RV32IF-NEXT: beqz a2, .LBB47_2
+; RV32IF-NEXT: beqz a0, .LBB47_2
; RV32IF-NEXT: # %bb.1: # %entry
-; RV32IF-NEXT: slti a4, a2, 0
+; RV32IF-NEXT: slti a4, a0, 0
; RV32IF-NEXT: j .LBB47_3
; RV32IF-NEXT: .LBB47_2:
; RV32IF-NEXT: seqz a4, a3
; RV32IF-NEXT: .LBB47_3: # %entry
; RV32IF-NEXT: xori a3, a3, 1
-; RV32IF-NEXT: or a3, a3, a2
+; RV32IF-NEXT: or a3, a3, a0
; RV32IF-NEXT: seqz a3, a3
; RV32IF-NEXT: addi a3, a3, -1
; RV32IF-NEXT: and a3, a3, a4
; RV32IF-NEXT: neg a3, a3
+; RV32IF-NEXT: and a2, a3, a2
; RV32IF-NEXT: and a1, a3, a1
; RV32IF-NEXT: and a0, a3, a0
-; RV32IF-NEXT: and a2, a3, a2
-; RV32IF-NEXT: slti a2, a2, 0
-; RV32IF-NEXT: addi a2, a2, -1
-; RV32IF-NEXT: and a0, a2, a0
-; RV32IF-NEXT: and a1, a2, a1
+; RV32IF-NEXT: slti a0, a0, 0
+; RV32IF-NEXT: addi a3, a0, -1
+; RV32IF-NEXT: and a0, a3, a1
+; RV32IF-NEXT: and a1, a3, a2
; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IF-NEXT: .cfi_restore ra
; RV32IF-NEXT: addi sp, sp, 32
@@ -3354,30 +3354,30 @@ define i64 @ustest_f64i64_mm(double %x) {
; RV32IFD-NEXT: .cfi_offset ra, -4
; RV32IFD-NEXT: addi a0, sp, 8
; RV32IFD-NEXT: call __fixdfti
-; RV32IFD-NEXT: lw a0, 8(sp)
-; RV32IFD-NEXT: lw a1, 12(sp)
-; RV32IFD-NEXT: lw a2, 20(sp)
+; RV32IFD-NEXT: lw a0, 20(sp)
+; RV32IFD-NEXT: lw a1, 8(sp)
+; RV32IFD-NEXT: lw a2, 12(sp)
; RV32IFD-NEXT: lw a3, 16(sp)
-; RV32IFD-NEXT: beqz a2, .LBB47_2
+; RV32IFD-NEXT: beqz a0, .LBB47_2
; RV32IFD-NEXT: # %bb.1: # %entry
-; RV32IFD-NEXT: slti a4, a2, 0
+; RV32IFD-NEXT: slti a4, a0, 0
; RV32IFD-NEXT: j .LBB47_3
; RV32IFD-NEXT: .LBB47_2:
; RV32IFD-NEXT: seqz a4, a3
; RV32IFD-NEXT: .LBB47_3: # %entry
; RV32IFD-NEXT: xori a3, a3, 1
-; RV32IFD-NEXT: or a3, a3, a2
+; RV32IFD-NEXT: or a3, a3, a0
; RV32IFD-NEXT: seqz a3, a3
; RV32IFD-NEXT: addi a3, a3, -1
; RV32IFD-NEXT: and a3, a3, a4
; RV32IFD-NEXT: neg a3, a3
+; RV32IFD-NEXT: and a2, a3, a2
; RV32IFD-NEXT: and a1, a3, a1
; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: and a2, a3, a2
-; RV32IFD-NEXT: slti a2, a2, 0
-; RV32IFD-NEXT: addi a2, a2, -1
-; RV32IFD-NEXT: and a0, a2, a0
-; RV32IFD-NEXT: and a1, a2, a1
+; RV32IFD-NEXT: slti a0, a0, 0
+; RV32IFD-NEXT: addi a3, a0, -1
+; RV32IFD-NEXT: and a0, a3, a1
+; RV32IFD-NEXT: and a1, a3, a2
; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: .cfi_restore ra
; RV32IFD-NEXT: addi sp, sp, 32
@@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) {
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: call __fixsfti
-; RV32-NEXT: lw a0, 8(sp)
-; RV32-NEXT: lw a1, 12(sp)
-; RV32-NEXT: lw a2, 20(sp)
+; RV32-NEXT: lw a0, 20(sp)
+; RV32-NEXT: lw a1, 8(sp)
+; RV32-NEXT: lw a2, 12(sp)
; RV32-NEXT: lw a3, 16(sp)
-; RV32-NEXT: beqz a2, .LBB50_2
+; RV32-NEXT: beqz a0, .LBB50_2
; RV32-NEXT: # %bb.1: # %entry
-; RV32-NEXT: slti a4, a2, 0
+; RV32-NEXT: slti a4, a0, 0
; RV32-NEXT: j .LBB50_3
; RV32-NEXT: .LBB50_2:
; RV32-NEXT: seqz a4, a3
; RV32-NEXT: .LBB50_3: # %entry
; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a3, a3, a2
+; RV32-NEXT: or a3, a3, a0
; RV32-NEXT: seqz a3, a3
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a3, a3, a4
; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: and a1, a3, a1
; RV32-NEXT: and a0, a3, a0
-; RV32-NEXT: and a2, a3, a2
-; RV32-NEXT: slti a2, a2, 0
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a0, a2, a0
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: slti a0, a0, 0
+; RV32-NEXT: addi a3, a0, -1
+; RV32-NEXT: and a0, a3, a1
+; RV32-NEXT: and a1, a3, a2
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
@@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) {
; RV32-NEXT: call __extendhfsf2
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: call __fixsfti
-; RV32-NEXT: lw a0, 8(sp)
-; RV32-NEXT: lw a1, 12(sp)
-; RV32-NEXT: lw a2, 20(sp)
+; RV32-NEXT: lw a0, 20(sp)
+; RV32-NEXT: lw a1, 8(sp)
+; RV32-NEXT: lw a2, 12(sp)
; RV32-NEXT: lw a3, 16(sp)
-; RV32-NEXT: beqz a2, .LBB53_2
+; RV32-NEXT: beqz a0, .LBB53_2
; RV32-NEXT: # %bb.1: # %entry
-; RV32-NEXT: slti a4, a2, 0
+; RV32-NEXT: slti a4, a0, 0
; RV32-NEXT: j .LBB53_3
; RV32-NEXT: .LBB53_2:
; RV32-NEXT: seqz a4, a3
; RV32-NEXT: .LBB53_3: # %entry
; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a3, a3, a2
+; RV32-NEXT: or a3, a3, a0
; RV32-NEXT: seqz a3, a3
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a3, a3, a4
; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: and a1, a3, a1
; RV32-NEXT: and a0, a3, a0
-; RV32-NEXT: and a2, a3, a2
-; RV32-NEXT: slti a2, a2, 0
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a0, a2, a0
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: slti a0, a0, 0
+; RV32-NEXT: addi a3, a0, -1
+; RV32-NEXT: and a0, a3, a1
+; RV32-NEXT: and a1, a3, a2
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index b1a6d163664e5..97d102561129d 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -7,18 +7,18 @@
define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
; RV32-LABEL: ctz_nxv4i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; RV32-NEXT: vid.v v10
-; RV32-NEXT: vmv.v.i v11, -1
; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; RV32-NEXT: vid.v v10
+; RV32-NEXT: li a1, -1
; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; RV32-NEXT: vmsne.vi v0, v8, 0
; RV32-NEXT: srli a0, a0, 1
; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV32-NEXT: vmv.v.x v8, a0
-; RV32-NEXT: vmacc.vv v8, v10, v11
-; RV32-NEXT: vmv.v.i v9, 0
-; RV32-NEXT: vmerge.vvm v8, v9, v8, v0
+; RV32-NEXT: vmadd.vx v10, a1, v8
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vvm v8, v8, v10, v0
; RV32-NEXT: vredmaxu.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: sub a0, a0, a1
@@ -28,18 +28,18 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
;
; RV64-LABEL: ctz_nxv4i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; RV64-NEXT: vid.v v10
-; RV64-NEXT: vmv.v.i v11, -1
; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; RV64-NEXT: vid.v v10
+; RV64-NEXT: li a1, -1
; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; RV64-NEXT: vmsne.vi v0, v8, 0
; RV64-NEXT: srli a0, a0, 1
; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: vmacc.vv v8, v10, v11
-; RV64-NEXT: vmv.v.i v9, 0
-; RV64-NEXT: vmerge.vvm v8, v9, v8, v0
+; RV64-NEXT: vmadd.vx v10, a1, v8
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vvm v8, v8, v10, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: subw a0, a0, a1
@@ -109,17 +109,17 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
;
; RV64-LABEL: ctz_nxv8i1_no_range:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT: vid.v v16
-; RV64-NEXT: vmv.v.i v24, -1
; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT: vid.v v16
+; RV64-NEXT: li a1, -1
; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; RV64-NEXT: vmsne.vi v0, v8, 0
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: vmacc.vv v8, v16, v24
-; RV64-NEXT: vmv.v.i v16, 0
-; RV64-NEXT: vmerge.vvm v8, v16, v8, v0
+; RV64-NEXT: vmadd.vx v16, a1, v8
+; RV64-NEXT: vmv.v.i v8, 0
+; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: sub a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index 32753ca382fc7..cd7f30d8f5898 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -716,92 +716,101 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 9(a0)
+; RV32I-NEXT: lbu t3, 10(a0)
+; RV32I-NEXT: lbu t4, 11(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or a4, a4, a3
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a3, t0, a7
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 9(a0)
-; RV32I-NEXT: lbu t0, 10(a0)
-; RV32I-NEXT: lbu t3, 11(a0)
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t2, t1
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a7, t3, t0
-; RV32I-NEXT: lbu t0, 12(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 14(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, t4, t3
+; RV32I-NEXT: or a6, t1, a6
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: lbu t2, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
; RV32I-NEXT: sw zero, 16(sp)
; RV32I-NEXT: sw zero, 20(sp)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: or a1, t2, t0
-; RV32I-NEXT: mv t0, sp
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or t2, t4, t3
-; RV32I-NEXT: srli t3, a0, 3
-; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: andi a5, a0, 31
-; RV32I-NEXT: andi t3, t3, 12
-; RV32I-NEXT: xori a5, a5, 31
-; RV32I-NEXT: or a3, t1, a3
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a1, t2, a1
-; RV32I-NEXT: add t0, t0, t3
-; RV32I-NEXT: sw a4, 0(sp)
-; RV32I-NEXT: sw a3, 4(sp)
-; RV32I-NEXT: sw a6, 8(sp)
-; RV32I-NEXT: sw a1, 12(sp)
-; RV32I-NEXT: lw a1, 4(t0)
-; RV32I-NEXT: lw a3, 8(t0)
-; RV32I-NEXT: lw a4, 0(t0)
-; RV32I-NEXT: lw a6, 12(t0)
-; RV32I-NEXT: srl a7, a1, a0
-; RV32I-NEXT: slli t0, a3, 1
-; RV32I-NEXT: srl a4, a4, a0
-; RV32I-NEXT: slli a1, a1, 1
-; RV32I-NEXT: srl a3, a3, a0
-; RV32I-NEXT: slli t1, a6, 1
-; RV32I-NEXT: srl a0, a6, a0
-; RV32I-NEXT: sll a6, t0, a5
-; RV32I-NEXT: sll a1, a1, a5
-; RV32I-NEXT: sll a5, t1, a5
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: mv t2, sp
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, t0, a7
+; RV32I-NEXT: or a5, t3, a5
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a3, a1, 31
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: xori a3, a3, 31
+; RV32I-NEXT: add a0, t2, a0
+; RV32I-NEXT: lw a4, 4(a0)
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: lw a6, 0(a0)
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: srl a7, a4, a1
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: srl a6, a6, a1
+; RV32I-NEXT: slli a4, a4, 1
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli t1, a0, 1
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: sll a1, t0, a3
+; RV32I-NEXT: sll a4, a4, a3
+; RV32I-NEXT: sll a3, t1, a3
; RV32I-NEXT: srli t0, a0, 16
; RV32I-NEXT: srli t1, a0, 24
; RV32I-NEXT: srli t2, a0, 8
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a1, a4, a1
-; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: or a1, a7, a1
+; RV32I-NEXT: or a4, a6, a4
+; RV32I-NEXT: or a3, a5, a3
; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb t2, 13(a2)
; RV32I-NEXT: sb t0, 14(a2)
; RV32I-NEXT: sb t1, 15(a2)
; RV32I-NEXT: srli a0, a3, 16
-; RV32I-NEXT: srli a4, a3, 24
-; RV32I-NEXT: srli a5, a3, 8
-; RV32I-NEXT: srli a7, a1, 16
-; RV32I-NEXT: srli t0, a1, 24
-; RV32I-NEXT: srli t1, a1, 8
-; RV32I-NEXT: srli t2, a6, 16
-; RV32I-NEXT: srli t3, a6, 24
+; RV32I-NEXT: srli a5, a3, 24
+; RV32I-NEXT: srli a6, a3, 8
+; RV32I-NEXT: srli a7, a4, 16
+; RV32I-NEXT: srli t0, a4, 24
+; RV32I-NEXT: srli t1, a4, 8
+; RV32I-NEXT: srli t2, a1, 16
+; RV32I-NEXT: srli t3, a1, 24
; RV32I-NEXT: sb a3, 8(a2)
-; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: sb a6, 9(a2)
; RV32I-NEXT: sb a0, 10(a2)
-; RV32I-NEXT: sb a4, 11(a2)
-; RV32I-NEXT: srli a0, a6, 8
-; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a5, 11(a2)
+; RV32I-NEXT: srli a0, a1, 8
+; RV32I-NEXT: sb a4, 0(a2)
; RV32I-NEXT: sb t1, 1(a2)
; RV32I-NEXT: sb a7, 2(a2)
; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: sb a6, 4(a2)
+; RV32I-NEXT: sb a1, 4(a2)
; RV32I-NEXT: sb a0, 5(a2)
; RV32I-NEXT: sb t2, 6(a2)
; RV32I-NEXT: sb t3, 7(a2)
@@ -943,93 +952,102 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: lbu a5, 8(a0)
+; RV32I-NEXT: lbu a6, 9(a0)
+; RV32I-NEXT: lbu t3, 10(a0)
+; RV32I-NEXT: lbu t4, 11(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or a4, a4, a3
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: or a3, t0, a7
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 9(a0)
-; RV32I-NEXT: lbu t0, 10(a0)
-; RV32I-NEXT: lbu t3, 11(a0)
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t2, t1
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a7, t3, t0
-; RV32I-NEXT: lbu t0, 12(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 14(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 12(a0)
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, t4, t3
+; RV32I-NEXT: or a6, t1, a6
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: lbu t2, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
; RV32I-NEXT: sw zero, 0(sp)
; RV32I-NEXT: sw zero, 4(sp)
; RV32I-NEXT: sw zero, 8(sp)
; RV32I-NEXT: sw zero, 12(sp)
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: or a1, t2, t0
-; RV32I-NEXT: addi t0, sp, 16
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or t2, t4, t3
-; RV32I-NEXT: srli t3, a0, 3
-; RV32I-NEXT: or a4, a5, a4
-; RV32I-NEXT: andi a5, a0, 31
-; RV32I-NEXT: andi t3, t3, 12
-; RV32I-NEXT: or a3, t1, a3
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a1, t2, a1
-; RV32I-NEXT: sub a7, t0, t3
-; RV32I-NEXT: sw a4, 16(sp)
-; RV32I-NEXT: sw a3, 20(sp)
-; RV32I-NEXT: sw a6, 24(sp)
-; RV32I-NEXT: sw a1, 28(sp)
-; RV32I-NEXT: lw a1, 0(a7)
-; RV32I-NEXT: lw a3, 4(a7)
-; RV32I-NEXT: lw a4, 8(a7)
-; RV32I-NEXT: lw a6, 12(a7)
-; RV32I-NEXT: xori a5, a5, 31
-; RV32I-NEXT: sll a7, a3, a0
-; RV32I-NEXT: srli t0, a1, 1
-; RV32I-NEXT: sll a6, a6, a0
-; RV32I-NEXT: srli t1, a4, 1
-; RV32I-NEXT: sll a4, a4, a0
-; RV32I-NEXT: srli a3, a3, 1
-; RV32I-NEXT: sll a0, a1, a0
-; RV32I-NEXT: srl a1, t0, a5
-; RV32I-NEXT: srl t0, t1, a5
-; RV32I-NEXT: srl a3, a3, a5
-; RV32I-NEXT: srli a5, a0, 16
-; RV32I-NEXT: srli t1, a0, 24
-; RV32I-NEXT: srli t2, a0, 8
-; RV32I-NEXT: or a1, a7, a1
-; RV32I-NEXT: or a6, a6, t0
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: addi t2, sp, 16
; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: sb a0, 0(a2)
+; RV32I-NEXT: or a4, t0, a7
+; RV32I-NEXT: or a5, t3, a5
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a3, a1, 31
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: sub a0, t2, a0
+; RV32I-NEXT: lw a4, 0(a0)
+; RV32I-NEXT: lw a5, 4(a0)
+; RV32I-NEXT: lw a6, 8(a0)
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: xori a3, a3, 31
+; RV32I-NEXT: sll a7, a5, a1
+; RV32I-NEXT: srli t0, a4, 1
+; RV32I-NEXT: sll a0, a0, a1
+; RV32I-NEXT: srli t1, a6, 1
+; RV32I-NEXT: sll a6, a6, a1
+; RV32I-NEXT: srli a5, a5, 1
+; RV32I-NEXT: sll a1, a4, a1
+; RV32I-NEXT: srl a4, t0, a3
+; RV32I-NEXT: srl t0, t1, a3
+; RV32I-NEXT: srl a3, a5, a3
+; RV32I-NEXT: srli a5, a1, 16
+; RV32I-NEXT: srli t1, a1, 24
+; RV32I-NEXT: srli t2, a1, 8
+; RV32I-NEXT: or a4, a7, a4
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: sb a1, 0(a2)
; RV32I-NEXT: sb t2, 1(a2)
; RV32I-NEXT: sb a5, 2(a2)
; RV32I-NEXT: sb t1, 3(a2)
-; RV32I-NEXT: srli a0, a3, 16
-; RV32I-NEXT: srli a4, a3, 24
-; RV32I-NEXT: srli a5, a3, 8
-; RV32I-NEXT: srli a7, a6, 16
-; RV32I-NEXT: srli t0, a6, 24
-; RV32I-NEXT: srli t1, a6, 8
-; RV32I-NEXT: srli t2, a1, 16
-; RV32I-NEXT: srli t3, a1, 24
+; RV32I-NEXT: srli a1, a3, 16
+; RV32I-NEXT: srli a5, a3, 24
+; RV32I-NEXT: srli a6, a3, 8
+; RV32I-NEXT: srli a7, a0, 16
+; RV32I-NEXT: srli t0, a0, 24
+; RV32I-NEXT: srli t1, a0, 8
+; RV32I-NEXT: srli t2, a4, 16
+; RV32I-NEXT: srli t3, a4, 24
; RV32I-NEXT: sb a3, 8(a2)
-; RV32I-NEXT: sb a5, 9(a2)
-; RV32I-NEXT: sb a0, 10(a2)
-; RV32I-NEXT: sb a4, 11(a2)
-; RV32I-NEXT: srli a0, a1, 8
-; RV32I-NEXT: sb a6, 12(a2)
+; RV32I-NEXT: sb a6, 9(a2)
+; RV32I-NEXT: sb a1, 10(a2)
+; RV32I-NEXT: sb a5, 11(a2)
+; RV32I-NEXT: srli a1, a4, 8
+; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb t1, 13(a2)
; RV32I-NEXT: sb a7, 14(a2)
; RV32I-NEXT: sb t0, 15(a2)
-; RV32I-NEXT: sb a1, 4(a2)
-; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: sb a1, 5(a2)
; RV32I-NEXT: sb t2, 6(a2)
; RV32I-NEXT: sb t3, 7(a2)
; RV32I-NEXT: addi sp, sp, 32
@@ -1168,73 +1186,82 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu t1, 6(a0)
; RV32I-NEXT: lbu t2, 7(a0)
; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 8(a0)
+; RV32I-NEXT: lbu t3, 9(a0)
+; RV32I-NEXT: lbu t4, 10(a0)
+; RV32I-NEXT: lbu t5, 11(a0)
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: lbu a6, 8(a0)
-; RV32I-NEXT: lbu a7, 9(a0)
-; RV32I-NEXT: lbu t0, 10(a0)
-; RV32I-NEXT: lbu t3, 11(a0)
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t2, t1
-; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a7, t3, t0
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
; RV32I-NEXT: lbu t0, 12(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 14(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: or a1, t2, t0
-; RV32I-NEXT: mv t0, sp
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: srli a4, a0, 3
-; RV32I-NEXT: or a5, t1, a5
-; RV32I-NEXT: andi t1, a0, 31
-; RV32I-NEXT: or t2, t4, t3
-; RV32I-NEXT: srai t3, t4, 31
-; RV32I-NEXT: andi a4, a4, 12
-; RV32I-NEXT: xori t1, t1, 31
+; RV32I-NEXT: lbu t1, 13(a0)
+; RV32I-NEXT: lbu t2, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or a4, t3, a4
+; RV32I-NEXT: or t3, t5, t4
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 1(a1)
+; RV32I-NEXT: lbu t4, 0(a1)
+; RV32I-NEXT: lbu t5, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t4
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t5
+; RV32I-NEXT: or a3, a5, a3
+; RV32I-NEXT: mv a5, sp
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t2, a0, t2
+; RV32I-NEXT: srai a0, a0, 31
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: or a1, t2, a1
-; RV32I-NEXT: sw t3, 16(sp)
-; RV32I-NEXT: sw t3, 20(sp)
-; RV32I-NEXT: sw t3, 24(sp)
-; RV32I-NEXT: sw t3, 28(sp)
-; RV32I-NEXT: add a4, t0, a4
+; RV32I-NEXT: or a4, t3, a4
+; RV32I-NEXT: or a7, t2, t0
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 28(sp)
; RV32I-NEXT: sw a3, 0(sp)
-; RV32I-NEXT: sw a5, 4(sp)
-; RV32I-NEXT: sw a6, 8(sp)
-; RV32I-NEXT: sw a1, 12(sp)
-; RV32I-NEXT: lw a1, 4(a4)
-; RV32I-NEXT: lw a3, 8(a4)
-; RV32I-NEXT: lw a5, 0(a4)
-; RV32I-NEXT: lw a4, 12(a4)
-; RV32I-NEXT: srl a6, a1, a0
-; RV32I-NEXT: slli a7, a3, 1
-; RV32I-NEXT: srl a5, a5, a0
-; RV32I-NEXT: slli a1, a1, 1
-; RV32I-NEXT: srl a3, a3, a0
-; RV32I-NEXT: slli t0, a4, 1
-; RV32I-NEXT: sra a0, a4, a0
-; RV32I-NEXT: sll a4, a7, t1
-; RV32I-NEXT: sll a1, a1, t1
-; RV32I-NEXT: sll a7, t0, t1
+; RV32I-NEXT: sw a6, 4(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a7, 12(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a3, a1, 31
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: xori a3, a3, 31
+; RV32I-NEXT: add a0, a5, a0
+; RV32I-NEXT: lw a4, 4(a0)
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: lw a6, 0(a0)
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: srl a7, a4, a1
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: srl a6, a6, a1
+; RV32I-NEXT: slli a4, a4, 1
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli t1, a0, 1
+; RV32I-NEXT: sra a0, a0, a1
+; RV32I-NEXT: sll a1, t0, a3
+; RV32I-NEXT: sll a4, a4, a3
+; RV32I-NEXT: sll a3, t1, a3
; RV32I-NEXT: srli t0, a0, 16
; RV32I-NEXT: srli t1, a0, 24
; RV32I-NEXT: srli t2, a0, 8
+; RV32I-NEXT: or a1, a7, a1
; RV32I-NEXT: or a4, a6, a4
-; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: or a3, a3, a7
+; RV32I-NEXT: or a3, a5, a3
; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb t2, 13(a2)
; RV32I-NEXT: sb t0, 14(a2)
@@ -1242,21 +1269,21 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: srli a0, a3, 16
; RV32I-NEXT: srli a5, a3, 24
; RV32I-NEXT: srli a6, a3, 8
-; RV32I-NEXT: srli a7, a1, 16
-; RV32I-NEXT: srli t0, a1, 24
-; RV32I-NEXT: srli t1, a1, 8
-; RV32I-NEXT: srli t2, a4, 16
-; RV32I-NEXT: srli t3, a4, 24
+; RV32I-NEXT: srli a7, a4, 16
+; RV32I-NEXT: srli t0, a4, 24
+; RV32I-NEXT: srli t1, a4, 8
+; RV32I-NEXT: srli t2, a1, 16
+; RV32I-NEXT: srli t3, a1, 24
; RV32I-NEXT: sb a3, 8(a2)
; RV32I-NEXT: sb a6, 9(a2)
; RV32I-NEXT: sb a0, 10(a2)
; RV32I-NEXT: sb a5, 11(a2)
-; RV32I-NEXT: srli a0, a4, 8
-; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a0, a1, 8
+; RV32I-NEXT: sb a4, 0(a2)
; RV32I-NEXT: sb t1, 1(a2)
; RV32I-NEXT: sb a7, 2(a2)
; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: sb a1, 4(a2)
; RV32I-NEXT: sb a0, 5(a2)
; RV32I-NEXT: sb t2, 6(a2)
; RV32I-NEXT: sb t3, 7(a2)
@@ -1272,17 +1299,19 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: lshr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
; RV64I-NEXT: lbu a3, 0(a0)
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a5, 2(a0)
@@ -1299,122 +1328,143 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: lbu s1, 13(a0)
; RV64I-NEXT: lbu s2, 14(a0)
; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: lbu s4, 16(a0)
; RV64I-NEXT: lbu s5, 17(a0)
; RV64I-NEXT: lbu s6, 18(a0)
; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli s8, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a5, a4, a3
+; RV64I-NEXT: or a6, a6, s8
+; RV64I-NEXT: or a3, t0, a7
+; RV64I-NEXT: or a4, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
; RV64I-NEXT: slli t4, t4, 8
; RV64I-NEXT: slli t5, t5, 16
; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: lbu t5, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s8, 22(a0)
-; RV64I-NEXT: lbu s9, 23(a0)
; RV64I-NEXT: slli s1, s1, 8
; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t6, 24(a0)
+; RV64I-NEXT: lbu s0, 25(a0)
+; RV64I-NEXT: lbu s1, 26(a0)
+; RV64I-NEXT: lbu s2, 27(a0)
; RV64I-NEXT: slli s5, s5, 8
; RV64I-NEXT: slli s6, s6, 16
; RV64I-NEXT: slli s7, s7, 24
-; RV64I-NEXT: or t1, s1, s0
-; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: slli s9, s9, 8
; RV64I-NEXT: or t3, s5, s4
; RV64I-NEXT: or t4, s7, s6
-; RV64I-NEXT: lbu s0, 24(a0)
-; RV64I-NEXT: lbu s1, 25(a0)
-; RV64I-NEXT: lbu s2, 26(a0)
-; RV64I-NEXT: lbu s3, 27(a0)
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s8, s8, 16
-; RV64I-NEXT: slli s9, s9, 24
-; RV64I-NEXT: slli s1, s1, 8
-; RV64I-NEXT: or t5, t6, t5
-; RV64I-NEXT: or t6, s9, s8
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: lbu s1, 28(a0)
+; RV64I-NEXT: or t5, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
; RV64I-NEXT: lbu s4, 29(a0)
; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu s6, 31(a0)
-; RV64I-NEXT: lbu a0, 0(a1)
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: slli s2, s2, 24
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or a0, s11, s10
+; RV64I-NEXT: or t6, s0, t6
+; RV64I-NEXT: or s0, s2, s1
+; RV64I-NEXT: or s1, s4, s3
+; RV64I-NEXT: lbu s2, 0(a1)
+; RV64I-NEXT: lbu s3, 1(a1)
+; RV64I-NEXT: lbu s4, 2(a1)
+; RV64I-NEXT: lbu s7, 3(a1)
+; RV64I-NEXT: slli s5, s5, 16
+; RV64I-NEXT: slli s6, s6, 24
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: or s5, s6, s5
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, s7, s4
+; RV64I-NEXT: lbu s4, 5(a1)
+; RV64I-NEXT: lbu s6, 4(a1)
+; RV64I-NEXT: lbu s7, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or s4, s4, s6
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, s7
; RV64I-NEXT: sd zero, 32(sp)
; RV64I-NEXT: sd zero, 40(sp)
; RV64I-NEXT: sd zero, 48(sp)
; RV64I-NEXT: sd zero, 56(sp)
-; RV64I-NEXT: slli s2, s2, 16
-; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: or a1, s3, s2
-; RV64I-NEXT: mv s2, sp
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
-; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: or s1, s4, s1
-; RV64I-NEXT: srli s3, a0, 3
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: andi s5, a0, 63
-; RV64I-NEXT: andi s3, s3, 24
-; RV64I-NEXT: xori s5, s5, 63
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: or a1, a1, s0
-; RV64I-NEXT: or t1, s4, s1
-; RV64I-NEXT: add s2, s2, s3
-; RV64I-NEXT: slli a4, a4, 32
-; RV64I-NEXT: slli a6, a6, 32
-; RV64I-NEXT: slli t0, t0, 32
-; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: mv a6, sp
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a1, t1, a1
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or t0, t4, t3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: or t1, s0, t6
+; RV64I-NEXT: or t2, s5, s1
+; RV64I-NEXT: or t3, s3, s2
+; RV64I-NEXT: or a1, a1, s4
+; RV64I-NEXT: slli a3, a3, 32
+; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli t2, t2, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a3, a3, a5
+; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: or a5, t2, t1
+; RV64I-NEXT: or a1, a1, t3
; RV64I-NEXT: sd a3, 0(sp)
; RV64I-NEXT: sd a4, 8(sp)
-; RV64I-NEXT: sd a5, 16(sp)
-; RV64I-NEXT: sd a1, 24(sp)
-; RV64I-NEXT: ld a1, 8(s2)
-; RV64I-NEXT: ld a3, 16(s2)
-; RV64I-NEXT: ld a4, 0(s2)
-; RV64I-NEXT: ld a5, 24(s2)
-; RV64I-NEXT: srl a6, a1, a0
-; RV64I-NEXT: slli a7, a3, 1
-; RV64I-NEXT: srl a4, a4, a0
-; RV64I-NEXT: slli a1, a1, 1
-; RV64I-NEXT: srl a3, a3, a0
+; RV64I-NEXT: sd a0, 16(sp)
+; RV64I-NEXT: sd a5, 24(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a3, a1, 63
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: xori a3, a3, 63
+; RV64I-NEXT: add a0, a6, a0
+; RV64I-NEXT: ld a4, 8(a0)
+; RV64I-NEXT: ld a5, 16(a0)
+; RV64I-NEXT: ld a6, 0(a0)
+; RV64I-NEXT: ld a0, 24(a0)
+; RV64I-NEXT: srl a7, a4, a1
; RV64I-NEXT: slli t0, a5, 1
-; RV64I-NEXT: srl a5, a5, a0
-; RV64I-NEXT: sll a0, a7, s5
-; RV64I-NEXT: sll a1, a1, s5
-; RV64I-NEXT: sll a7, t0, s5
-; RV64I-NEXT: srli t0, a5, 56
-; RV64I-NEXT: srli t1, a5, 48
-; RV64I-NEXT: srli t2, a5, 40
-; RV64I-NEXT: srli t3, a5, 32
-; RV64I-NEXT: srli t4, a5, 24
-; RV64I-NEXT: srli t5, a5, 16
-; RV64I-NEXT: srli t6, a5, 8
-; RV64I-NEXT: or a0, a6, a0
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: or a3, a3, a7
+; RV64I-NEXT: srl a6, a6, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: srl a5, a5, a1
+; RV64I-NEXT: slli t1, a0, 1
+; RV64I-NEXT: srl t2, a0, a1
+; RV64I-NEXT: sll a0, t0, a3
+; RV64I-NEXT: sll a1, a4, a3
+; RV64I-NEXT: sll a3, t1, a3
+; RV64I-NEXT: srli a4, t2, 56
+; RV64I-NEXT: srli t0, t2, 48
+; RV64I-NEXT: srli t1, t2, 40
+; RV64I-NEXT: srli t3, t2, 32
+; RV64I-NEXT: srli t4, t2, 24
+; RV64I-NEXT: srli t5, t2, 16
+; RV64I-NEXT: srli t6, t2, 8
+; RV64I-NEXT: or a0, a7, a0
+; RV64I-NEXT: or a1, a6, a1
+; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: sb t3, 28(a2)
-; RV64I-NEXT: sb t2, 29(a2)
-; RV64I-NEXT: sb t1, 30(a2)
-; RV64I-NEXT: sb t0, 31(a2)
-; RV64I-NEXT: sb a5, 24(a2)
+; RV64I-NEXT: sb t1, 29(a2)
+; RV64I-NEXT: sb t0, 30(a2)
+; RV64I-NEXT: sb a4, 31(a2)
+; RV64I-NEXT: sb t2, 24(a2)
; RV64I-NEXT: sb t6, 25(a2)
; RV64I-NEXT: sb t5, 26(a2)
; RV64I-NEXT: sb t4, 27(a2)
@@ -1463,17 +1513,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: sb a1, 9(a2)
; RV64I-NEXT: sb a5, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
-; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: lshr_32bytes:
@@ -1498,55 +1550,67 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a7, 3(a0)
; RV32I-NEXT: lbu a5, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t6, 7(a0)
-; RV32I-NEXT: lbu s2, 8(a0)
-; RV32I-NEXT: lbu s3, 9(a0)
-; RV32I-NEXT: lbu s4, 10(a0)
-; RV32I-NEXT: lbu s5, 11(a0)
-; RV32I-NEXT: lbu s7, 12(a0)
-; RV32I-NEXT: lbu s8, 13(a0)
-; RV32I-NEXT: lbu s9, 14(a0)
-; RV32I-NEXT: lbu s10, 15(a0)
-; RV32I-NEXT: lbu s11, 16(a0)
-; RV32I-NEXT: lbu ra, 17(a0)
-; RV32I-NEXT: lbu t4, 18(a0)
-; RV32I-NEXT: lbu s0, 19(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s2, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 21(a0)
-; RV32I-NEXT: lbu t5, 22(a0)
-; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu ra, 22(a0)
+; RV32I-NEXT: lbu a3, 23(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or a5, t0, a5
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: lbu s1, 24(a0)
+; RV32I-NEXT: lbu s3, 25(a0)
+; RV32I-NEXT: lbu t4, 26(a0)
+; RV32I-NEXT: lbu t5, 27(a0)
+; RV32I-NEXT: slli s2, s2, 8
; RV32I-NEXT: slli s4, s4, 16
; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t6, t3
-; RV32I-NEXT: or a7, s3, s2
-; RV32I-NEXT: or t0, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s5, 25(a0)
-; RV32I-NEXT: lbu s6, 26(a0)
-; RV32I-NEXT: lbu t6, 27(a0)
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: slli s9, s9, 16
-; RV32I-NEXT: slli s10, s10, 24
-; RV32I-NEXT: slli ra, ra, 8
-; RV32I-NEXT: or s7, s8, s7
-; RV32I-NEXT: or s2, s10, s9
-; RV32I-NEXT: or s3, ra, s11
-; RV32I-NEXT: lbu s4, 28(a0)
-; RV32I-NEXT: lbu s8, 29(a0)
-; RV32I-NEXT: lbu s9, 30(a0)
-; RV32I-NEXT: lbu s10, 31(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: or t3, s7, s6
+; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu s6, 31(a0)
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a0, s9, s8
+; RV32I-NEXT: or s0, s11, s10
+; RV32I-NEXT: or s2, a3, ra
+; RV32I-NEXT: lbu a3, 0(a1)
+; RV32I-NEXT: lbu s7, 1(a1)
+; RV32I-NEXT: lbu s8, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: sw zero, 56(sp)
; RV32I-NEXT: sw zero, 60(sp)
; RV32I-NEXT: sw zero, 64(sp)
@@ -1555,90 +1619,89 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sw zero, 44(sp)
; RV32I-NEXT: sw zero, 48(sp)
; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: addi s3, sp, 8
; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or t4, s0, t4
-; RV32I-NEXT: addi s0, sp, 8
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s5, s5, 8
-; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: slli s9, s9, 16
-; RV32I-NEXT: slli s10, s10, 24
-; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s4, t6
+; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or a3, s7, a3
+; RV32I-NEXT: or a1, a1, s8
+; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, s4
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: or t0, a0, t3
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, t4, s1
+; RV32I-NEXT: or t3, t6, t5
+; RV32I-NEXT: or a0, a1, a3
+; RV32I-NEXT: sw t0, 24(sp)
+; RV32I-NEXT: sw t1, 28(sp)
+; RV32I-NEXT: sw t2, 32(sp)
+; RV32I-NEXT: sw t3, 36(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: sw a7, 20(sp)
; RV32I-NEXT: srli a1, a0, 3
-; RV32I-NEXT: or t2, s1, t5
-; RV32I-NEXT: andi t5, a0, 31
-; RV32I-NEXT: or t3, s5, t3
-; RV32I-NEXT: or t6, t6, s6
-; RV32I-NEXT: or s1, s8, s4
-; RV32I-NEXT: or s4, s10, s9
-; RV32I-NEXT: andi s5, a1, 28
-; RV32I-NEXT: xori a1, t5, 31
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, s2, s7
-; RV32I-NEXT: or a7, t4, s3
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or t1, t6, t3
-; RV32I-NEXT: or t2, s4, s1
-; RV32I-NEXT: add s0, s0, s5
-; RV32I-NEXT: sw a7, 24(sp)
-; RV32I-NEXT: sw t0, 28(sp)
-; RV32I-NEXT: sw t1, 32(sp)
-; RV32I-NEXT: sw t2, 36(sp)
-; RV32I-NEXT: sw a3, 8(sp)
-; RV32I-NEXT: sw a4, 12(sp)
-; RV32I-NEXT: sw a5, 16(sp)
-; RV32I-NEXT: sw a6, 20(sp)
-; RV32I-NEXT: lw a3, 0(s0)
-; RV32I-NEXT: lw a4, 4(s0)
-; RV32I-NEXT: lw a5, 8(s0)
-; RV32I-NEXT: lw a6, 12(s0)
-; RV32I-NEXT: lw a7, 16(s0)
-; RV32I-NEXT: lw t0, 20(s0)
-; RV32I-NEXT: lw t1, 24(s0)
-; RV32I-NEXT: lw t2, 28(s0)
-; RV32I-NEXT: srl t3, a4, a0
-; RV32I-NEXT: slli t4, a5, 1
+; RV32I-NEXT: andi a3, a0, 31
+; RV32I-NEXT: andi a4, a1, 28
+; RV32I-NEXT: xori a1, a3, 31
+; RV32I-NEXT: add a4, s3, a4
+; RV32I-NEXT: lw a3, 0(a4)
+; RV32I-NEXT: lw a5, 4(a4)
+; RV32I-NEXT: lw a6, 8(a4)
+; RV32I-NEXT: lw a7, 12(a4)
+; RV32I-NEXT: lw t0, 16(a4)
+; RV32I-NEXT: lw t1, 20(a4)
+; RV32I-NEXT: lw t2, 24(a4)
+; RV32I-NEXT: lw a4, 28(a4)
+; RV32I-NEXT: srl t3, a5, a0
+; RV32I-NEXT: slli t4, a6, 1
; RV32I-NEXT: srl a3, a3, a0
-; RV32I-NEXT: slli a4, a4, 1
-; RV32I-NEXT: srl t5, a6, a0
-; RV32I-NEXT: slli t6, a7, 1
-; RV32I-NEXT: srl a5, a5, a0
-; RV32I-NEXT: slli a6, a6, 1
-; RV32I-NEXT: srl s0, t0, a0
-; RV32I-NEXT: slli s1, t1, 1
-; RV32I-NEXT: srl a7, a7, a0
-; RV32I-NEXT: slli t0, t0, 1
-; RV32I-NEXT: srl t1, t1, a0
-; RV32I-NEXT: slli s2, t2, 1
+; RV32I-NEXT: slli a5, a5, 1
+; RV32I-NEXT: srl t5, a7, a0
+; RV32I-NEXT: slli t6, t0, 1
+; RV32I-NEXT: srl a6, a6, a0
+; RV32I-NEXT: slli a7, a7, 1
+; RV32I-NEXT: srl s0, t1, a0
+; RV32I-NEXT: slli s1, t2, 1
+; RV32I-NEXT: srl t0, t0, a0
+; RV32I-NEXT: slli t1, t1, 1
; RV32I-NEXT: srl t2, t2, a0
+; RV32I-NEXT: slli s2, a4, 1
+; RV32I-NEXT: srl s3, a4, a0
; RV32I-NEXT: sll a0, t4, a1
-; RV32I-NEXT: sll a4, a4, a1
-; RV32I-NEXT: sll t4, t6, a1
-; RV32I-NEXT: sll a6, a6, a1
-; RV32I-NEXT: sll t6, s1, a1
-; RV32I-NEXT: sll t0, t0, a1
-; RV32I-NEXT: sll s1, s2, a1
-; RV32I-NEXT: srli s2, t2, 24
-; RV32I-NEXT: srli s3, t2, 16
-; RV32I-NEXT: srli s4, t2, 8
+; RV32I-NEXT: sll a4, a5, a1
+; RV32I-NEXT: sll a5, t6, a1
+; RV32I-NEXT: sll a7, a7, a1
+; RV32I-NEXT: sll t4, s1, a1
+; RV32I-NEXT: sll t1, t1, a1
+; RV32I-NEXT: sll t6, s2, a1
+; RV32I-NEXT: srli s1, s3, 24
+; RV32I-NEXT: srli s2, s3, 16
+; RV32I-NEXT: srli s4, s3, 8
; RV32I-NEXT: or a0, t3, a0
; RV32I-NEXT: or a1, a3, a4
-; RV32I-NEXT: or a3, t5, t4
-; RV32I-NEXT: or a4, a5, a6
-; RV32I-NEXT: or a5, s0, t6
-; RV32I-NEXT: or a6, a7, t0
-; RV32I-NEXT: or a7, t1, s1
-; RV32I-NEXT: sb t2, 28(a2)
+; RV32I-NEXT: or a3, t5, a5
+; RV32I-NEXT: or a4, a6, a7
+; RV32I-NEXT: or a5, s0, t4
+; RV32I-NEXT: or a6, t0, t1
+; RV32I-NEXT: or a7, t2, t6
+; RV32I-NEXT: sb s3, 28(a2)
; RV32I-NEXT: sb s4, 29(a2)
-; RV32I-NEXT: sb s3, 30(a2)
-; RV32I-NEXT: sb s2, 31(a2)
+; RV32I-NEXT: sb s2, 30(a2)
+; RV32I-NEXT: sb s1, 31(a2)
; RV32I-NEXT: srli t0, a7, 24
; RV32I-NEXT: srli t1, a7, 16
; RV32I-NEXT: srli t2, a7, 8
@@ -1712,17 +1775,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: shl_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
; RV64I-NEXT: lbu a3, 0(a0)
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a5, 2(a0)
@@ -1739,125 +1804,146 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: lbu s1, 13(a0)
; RV64I-NEXT: lbu s2, 14(a0)
; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: lbu s4, 16(a0)
; RV64I-NEXT: lbu s5, 17(a0)
; RV64I-NEXT: lbu s6, 18(a0)
; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli s8, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a5, a4, a3
+; RV64I-NEXT: or a6, a6, s8
+; RV64I-NEXT: or a3, t0, a7
+; RV64I-NEXT: or a4, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
; RV64I-NEXT: slli t4, t4, 8
; RV64I-NEXT: slli t5, t5, 16
; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: lbu t5, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s8, 22(a0)
-; RV64I-NEXT: lbu s9, 23(a0)
; RV64I-NEXT: slli s1, s1, 8
; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t6, 24(a0)
+; RV64I-NEXT: lbu s0, 25(a0)
+; RV64I-NEXT: lbu s1, 26(a0)
+; RV64I-NEXT: lbu s2, 27(a0)
; RV64I-NEXT: slli s5, s5, 8
; RV64I-NEXT: slli s6, s6, 16
; RV64I-NEXT: slli s7, s7, 24
-; RV64I-NEXT: or t1, s1, s0
-; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: slli s9, s9, 8
; RV64I-NEXT: or t3, s5, s4
; RV64I-NEXT: or t4, s7, s6
-; RV64I-NEXT: lbu s0, 24(a0)
-; RV64I-NEXT: lbu s1, 25(a0)
-; RV64I-NEXT: lbu s2, 26(a0)
-; RV64I-NEXT: lbu s3, 27(a0)
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s8, s8, 16
-; RV64I-NEXT: slli s9, s9, 24
-; RV64I-NEXT: slli s1, s1, 8
-; RV64I-NEXT: or t5, t6, t5
-; RV64I-NEXT: or t6, s9, s8
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: lbu s1, 28(a0)
+; RV64I-NEXT: or t5, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
; RV64I-NEXT: lbu s4, 29(a0)
; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu s6, 31(a0)
-; RV64I-NEXT: lbu a0, 0(a1)
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: slli s2, s2, 24
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or a0, s11, s10
+; RV64I-NEXT: or t6, s0, t6
+; RV64I-NEXT: or s0, s2, s1
+; RV64I-NEXT: or s1, s4, s3
+; RV64I-NEXT: lbu s2, 0(a1)
+; RV64I-NEXT: lbu s3, 1(a1)
+; RV64I-NEXT: lbu s4, 2(a1)
+; RV64I-NEXT: lbu s7, 3(a1)
+; RV64I-NEXT: slli s5, s5, 16
+; RV64I-NEXT: slli s6, s6, 24
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: or s5, s6, s5
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, s7, s4
+; RV64I-NEXT: lbu s4, 5(a1)
+; RV64I-NEXT: lbu s6, 4(a1)
+; RV64I-NEXT: lbu s7, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or s4, s4, s6
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, s7
; RV64I-NEXT: sd zero, 0(sp)
; RV64I-NEXT: sd zero, 8(sp)
; RV64I-NEXT: sd zero, 16(sp)
; RV64I-NEXT: sd zero, 24(sp)
-; RV64I-NEXT: slli s2, s2, 16
-; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: or a1, s3, s2
-; RV64I-NEXT: addi s2, sp, 32
-; RV64I-NEXT: slli s4, s4, 8
-; RV64I-NEXT: slli s5, s5, 16
-; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: or s1, s4, s1
-; RV64I-NEXT: srli s3, a0, 3
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: andi s5, a0, 63
-; RV64I-NEXT: andi s3, s3, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: or a1, a1, s0
-; RV64I-NEXT: or t1, s4, s1
-; RV64I-NEXT: sub t2, s2, s3
-; RV64I-NEXT: slli a4, a4, 32
-; RV64I-NEXT: slli a6, a6, 32
-; RV64I-NEXT: slli t0, t0, 32
-; RV64I-NEXT: slli t1, t1, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: addi a6, sp, 32
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a1, t1, a1
+; RV64I-NEXT: or a4, t0, a7
+; RV64I-NEXT: or a7, t2, t1
+; RV64I-NEXT: or t0, t4, t3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: or t1, s0, t6
+; RV64I-NEXT: or t2, s5, s1
+; RV64I-NEXT: or t3, s3, s2
+; RV64I-NEXT: or a1, a1, s4
+; RV64I-NEXT: slli a3, a3, 32
+; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli t2, t2, 32
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a3, a3, a5
+; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: or a5, t2, t1
+; RV64I-NEXT: or a1, a1, t3
; RV64I-NEXT: sd a3, 32(sp)
; RV64I-NEXT: sd a4, 40(sp)
-; RV64I-NEXT: sd a5, 48(sp)
-; RV64I-NEXT: sd a1, 56(sp)
-; RV64I-NEXT: ld a1, 0(t2)
-; RV64I-NEXT: ld a3, 8(t2)
-; RV64I-NEXT: ld a4, 16(t2)
-; RV64I-NEXT: ld a5, 24(t2)
-; RV64I-NEXT: xori a6, s5, 63
-; RV64I-NEXT: sll a7, a3, a0
-; RV64I-NEXT: srli t0, a1, 1
-; RV64I-NEXT: sll a5, a5, a0
-; RV64I-NEXT: srli t1, a4, 1
-; RV64I-NEXT: sll a4, a4, a0
-; RV64I-NEXT: srli a3, a3, 1
-; RV64I-NEXT: sll t2, a1, a0
-; RV64I-NEXT: srl a0, t0, a6
-; RV64I-NEXT: srl a1, t1, a6
-; RV64I-NEXT: srl a3, a3, a6
-; RV64I-NEXT: srli a6, t2, 56
-; RV64I-NEXT: srli t0, t2, 48
-; RV64I-NEXT: srli t1, t2, 40
-; RV64I-NEXT: srli t3, t2, 32
-; RV64I-NEXT: srli t4, t2, 24
-; RV64I-NEXT: srli t5, t2, 16
-; RV64I-NEXT: srli t6, t2, 8
-; RV64I-NEXT: or a0, a7, a0
-; RV64I-NEXT: or a1, a5, a1
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: sb t3, 4(a2)
-; RV64I-NEXT: sb t1, 5(a2)
-; RV64I-NEXT: sb t0, 6(a2)
-; RV64I-NEXT: sb a6, 7(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t6, 1(a2)
-; RV64I-NEXT: sb t5, 2(a2)
-; RV64I-NEXT: sb t4, 3(a2)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a5, 56(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a3, a1, 63
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: sub a0, a6, a0
+; RV64I-NEXT: ld a4, 0(a0)
+; RV64I-NEXT: ld a5, 8(a0)
+; RV64I-NEXT: ld a6, 16(a0)
+; RV64I-NEXT: ld a0, 24(a0)
+; RV64I-NEXT: xori a3, a3, 63
+; RV64I-NEXT: sll a7, a5, a1
+; RV64I-NEXT: srli t0, a4, 1
+; RV64I-NEXT: sll t1, a0, a1
+; RV64I-NEXT: srli a0, a6, 1
+; RV64I-NEXT: sll a6, a6, a1
+; RV64I-NEXT: srli a5, a5, 1
+; RV64I-NEXT: sll a4, a4, a1
+; RV64I-NEXT: srl a1, t0, a3
+; RV64I-NEXT: srl t0, a0, a3
+; RV64I-NEXT: srl a3, a5, a3
+; RV64I-NEXT: srli a5, a4, 56
+; RV64I-NEXT: srli t2, a4, 48
+; RV64I-NEXT: srli t3, a4, 40
+; RV64I-NEXT: srli t4, a4, 32
+; RV64I-NEXT: srli t5, a4, 24
+; RV64I-NEXT: srli t6, a4, 16
+; RV64I-NEXT: srli s0, a4, 8
+; RV64I-NEXT: or a0, a7, a1
+; RV64I-NEXT: or a1, t1, t0
+; RV64I-NEXT: or a3, a6, a3
+; RV64I-NEXT: sb t4, 4(a2)
+; RV64I-NEXT: sb t3, 5(a2)
+; RV64I-NEXT: sb t2, 6(a2)
+; RV64I-NEXT: sb a5, 7(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb s0, 1(a2)
+; RV64I-NEXT: sb t6, 2(a2)
+; RV64I-NEXT: sb t5, 3(a2)
; RV64I-NEXT: srli a4, a3, 56
; RV64I-NEXT: srli a5, a3, 48
; RV64I-NEXT: srli a6, a3, 40
@@ -1903,17 +1989,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: sb a1, 9(a2)
; RV64I-NEXT: sb a5, 10(a2)
; RV64I-NEXT: sb a3, 11(a2)
-; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: shl_32bytes:
@@ -1938,55 +2026,67 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a7, 3(a0)
; RV32I-NEXT: lbu a5, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t6, 7(a0)
-; RV32I-NEXT: lbu s2, 8(a0)
-; RV32I-NEXT: lbu s3, 9(a0)
-; RV32I-NEXT: lbu s4, 10(a0)
-; RV32I-NEXT: lbu s5, 11(a0)
-; RV32I-NEXT: lbu s7, 12(a0)
-; RV32I-NEXT: lbu s8, 13(a0)
-; RV32I-NEXT: lbu s9, 14(a0)
-; RV32I-NEXT: lbu s10, 15(a0)
-; RV32I-NEXT: lbu s11, 16(a0)
-; RV32I-NEXT: lbu ra, 17(a0)
-; RV32I-NEXT: lbu t4, 18(a0)
-; RV32I-NEXT: lbu s0, 19(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s2, 13(a0)
+; RV32I-NEXT: lbu s4, 14(a0)
+; RV32I-NEXT: lbu s5, 15(a0)
+; RV32I-NEXT: lbu s6, 16(a0)
+; RV32I-NEXT: lbu s7, 17(a0)
+; RV32I-NEXT: lbu s8, 18(a0)
+; RV32I-NEXT: lbu s9, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 21(a0)
-; RV32I-NEXT: lbu t5, 22(a0)
-; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: lbu s10, 20(a0)
+; RV32I-NEXT: lbu s11, 21(a0)
+; RV32I-NEXT: lbu ra, 22(a0)
+; RV32I-NEXT: lbu a3, 23(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or a5, t0, a5
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: lbu s1, 24(a0)
+; RV32I-NEXT: lbu s3, 25(a0)
+; RV32I-NEXT: lbu t4, 26(a0)
+; RV32I-NEXT: lbu t5, 27(a0)
+; RV32I-NEXT: slli s2, s2, 8
; RV32I-NEXT: slli s4, s4, 16
; RV32I-NEXT: slli s5, s5, 24
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t6, t3
-; RV32I-NEXT: or a7, s3, s2
-; RV32I-NEXT: or t0, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s5, 25(a0)
-; RV32I-NEXT: lbu s6, 26(a0)
-; RV32I-NEXT: lbu t6, 27(a0)
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: slli s9, s9, 16
-; RV32I-NEXT: slli s10, s10, 24
-; RV32I-NEXT: slli ra, ra, 8
-; RV32I-NEXT: or s7, s8, s7
-; RV32I-NEXT: or s2, s10, s9
-; RV32I-NEXT: or s3, ra, s11
-; RV32I-NEXT: lbu s4, 28(a0)
-; RV32I-NEXT: lbu s8, 29(a0)
-; RV32I-NEXT: lbu s9, 30(a0)
-; RV32I-NEXT: lbu s10, 31(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, s5, s4
+; RV32I-NEXT: or t3, s7, s6
+; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: lbu s4, 29(a0)
+; RV32I-NEXT: lbu s5, 30(a0)
+; RV32I-NEXT: lbu s6, 31(a0)
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli s9, s9, 24
+; RV32I-NEXT: slli s11, s11, 8
+; RV32I-NEXT: slli ra, ra, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a0, s9, s8
+; RV32I-NEXT: or s0, s11, s10
+; RV32I-NEXT: or s2, a3, ra
+; RV32I-NEXT: lbu a3, 0(a1)
+; RV32I-NEXT: lbu s7, 1(a1)
+; RV32I-NEXT: lbu s8, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: sw zero, 24(sp)
; RV32I-NEXT: sw zero, 28(sp)
; RV32I-NEXT: sw zero, 32(sp)
@@ -1995,89 +2095,88 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sw zero, 12(sp)
; RV32I-NEXT: sw zero, 16(sp)
; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: slli s3, s3, 8
+; RV32I-NEXT: or s1, s3, s1
+; RV32I-NEXT: addi s3, sp, 40
; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or t4, s0, t4
-; RV32I-NEXT: addi s0, sp, 40
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s5, s5, 8
-; RV32I-NEXT: slli s6, s6, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: slli s8, s8, 8
-; RV32I-NEXT: slli s9, s9, 16
-; RV32I-NEXT: slli s10, s10, 24
-; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli s4, s4, 8
+; RV32I-NEXT: slli s5, s5, 16
+; RV32I-NEXT: slli s6, s6, 24
+; RV32I-NEXT: slli s7, s7, 8
+; RV32I-NEXT: slli s8, s8, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s4, t6
+; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or a3, s7, a3
+; RV32I-NEXT: or a1, a1, s8
+; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, s4
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: or t0, a0, t3
+; RV32I-NEXT: or t1, s2, s0
+; RV32I-NEXT: or t2, t4, s1
+; RV32I-NEXT: or t3, t6, t5
+; RV32I-NEXT: or a0, a1, a3
+; RV32I-NEXT: sw t0, 56(sp)
+; RV32I-NEXT: sw t1, 60(sp)
+; RV32I-NEXT: sw t2, 64(sp)
+; RV32I-NEXT: sw t3, 68(sp)
+; RV32I-NEXT: sw a4, 40(sp)
+; RV32I-NEXT: sw a5, 44(sp)
+; RV32I-NEXT: sw a6, 48(sp)
+; RV32I-NEXT: sw a7, 52(sp)
; RV32I-NEXT: srli a1, a0, 3
-; RV32I-NEXT: or t2, s1, t5
-; RV32I-NEXT: andi t5, a0, 31
-; RV32I-NEXT: or t3, s5, t3
-; RV32I-NEXT: or t6, t6, s6
-; RV32I-NEXT: or s1, s8, s4
-; RV32I-NEXT: or s4, s10, s9
-; RV32I-NEXT: andi s5, a1, 28
-; RV32I-NEXT: xori a1, t5, 31
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, s2, s7
-; RV32I-NEXT: or a7, t4, s3
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or t1, t6, t3
-; RV32I-NEXT: or t2, s4, s1
-; RV32I-NEXT: sub t3, s0, s5
-; RV32I-NEXT: sw a7, 56(sp)
-; RV32I-NEXT: sw t0, 60(sp)
-; RV32I-NEXT: sw t1, 64(sp)
-; RV32I-NEXT: sw t2, 68(sp)
-; RV32I-NEXT: sw a3, 40(sp)
-; RV32I-NEXT: sw a4, 44(sp)
-; RV32I-NEXT: sw a5, 48(sp)
-; RV32I-NEXT: sw a6, 52(sp)
-; RV32I-NEXT: lw a3, 0(t3)
-; RV32I-NEXT: lw a4, 4(t3)
-; RV32I-NEXT: lw a5, 8(t3)
-; RV32I-NEXT: lw a6, 12(t3)
-; RV32I-NEXT: lw a7, 16(t3)
-; RV32I-NEXT: lw t0, 20(t3)
-; RV32I-NEXT: lw t1, 24(t3)
-; RV32I-NEXT: lw t2, 28(t3)
-; RV32I-NEXT: sll t3, a4, a0
-; RV32I-NEXT: srli t4, a3, 1
-; RV32I-NEXT: sll t5, a6, a0
-; RV32I-NEXT: srli t6, a5, 1
-; RV32I-NEXT: sll a5, a5, a0
-; RV32I-NEXT: srli a4, a4, 1
-; RV32I-NEXT: sll s0, t0, a0
-; RV32I-NEXT: srli s1, a7, 1
-; RV32I-NEXT: sll a7, a7, a0
-; RV32I-NEXT: srli a6, a6, 1
+; RV32I-NEXT: andi a3, a0, 31
+; RV32I-NEXT: andi a4, a1, 28
+; RV32I-NEXT: xori a1, a3, 31
+; RV32I-NEXT: sub a3, s3, a4
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: lw a5, 4(a3)
+; RV32I-NEXT: lw a6, 8(a3)
+; RV32I-NEXT: lw a7, 12(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw t1, 20(a3)
+; RV32I-NEXT: lw t2, 24(a3)
+; RV32I-NEXT: lw a3, 28(a3)
+; RV32I-NEXT: sll t3, a5, a0
+; RV32I-NEXT: srli t4, a4, 1
+; RV32I-NEXT: sll t5, a7, a0
+; RV32I-NEXT: srli t6, a6, 1
+; RV32I-NEXT: sll a6, a6, a0
+; RV32I-NEXT: srli a5, a5, 1
+; RV32I-NEXT: sll s0, t1, a0
+; RV32I-NEXT: srli s1, t0, 1
+; RV32I-NEXT: sll t0, t0, a0
+; RV32I-NEXT: srli a7, a7, 1
+; RV32I-NEXT: sll s2, a3, a0
+; RV32I-NEXT: srli a3, t2, 1
; RV32I-NEXT: sll t2, t2, a0
-; RV32I-NEXT: srli s2, t1, 1
-; RV32I-NEXT: sll t1, t1, a0
-; RV32I-NEXT: srli t0, t0, 1
-; RV32I-NEXT: sll s3, a3, a0
+; RV32I-NEXT: srli t1, t1, 1
+; RV32I-NEXT: sll s3, a4, a0
; RV32I-NEXT: srl a0, t4, a1
-; RV32I-NEXT: srl a3, t6, a1
-; RV32I-NEXT: srl a4, a4, a1
+; RV32I-NEXT: srl a4, t6, a1
+; RV32I-NEXT: srl a5, a5, a1
; RV32I-NEXT: srl t4, s1, a1
-; RV32I-NEXT: srl a6, a6, a1
-; RV32I-NEXT: srl t6, s2, a1
-; RV32I-NEXT: srl t0, t0, a1
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: srl t6, a3, a1
+; RV32I-NEXT: srl t1, t1, a1
; RV32I-NEXT: srli s1, s3, 24
-; RV32I-NEXT: srli s2, s3, 16
-; RV32I-NEXT: srli s4, s3, 8
+; RV32I-NEXT: srli s4, s3, 16
+; RV32I-NEXT: srli s5, s3, 8
; RV32I-NEXT: or a0, t3, a0
-; RV32I-NEXT: or a1, t5, a3
-; RV32I-NEXT: or a3, a5, a4
+; RV32I-NEXT: or a1, t5, a4
+; RV32I-NEXT: or a3, a6, a5
; RV32I-NEXT: or a4, s0, t4
-; RV32I-NEXT: or a5, a7, a6
-; RV32I-NEXT: or a6, t2, t6
-; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a5, t0, a7
+; RV32I-NEXT: or a6, s2, t6
+; RV32I-NEXT: or a7, t2, t1
; RV32I-NEXT: sb s3, 0(a2)
-; RV32I-NEXT: sb s4, 1(a2)
-; RV32I-NEXT: sb s2, 2(a2)
+; RV32I-NEXT: sb s5, 1(a2)
+; RV32I-NEXT: sb s4, 2(a2)
; RV32I-NEXT: sb s1, 3(a2)
; RV32I-NEXT: srli t0, a7, 24
; RV32I-NEXT: srli t1, a7, 16
@@ -2152,17 +2251,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: ashr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill
; RV64I-NEXT: lbu a3, 0(a0)
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a5, 2(a0)
@@ -2179,123 +2280,144 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: lbu s1, 13(a0)
; RV64I-NEXT: lbu s2, 14(a0)
; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: slli a4, a4, 8
-; RV64I-NEXT: slli a5, a5, 16
-; RV64I-NEXT: slli a6, a6, 24
-; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: lbu s4, 16(a0)
; RV64I-NEXT: lbu s5, 17(a0)
; RV64I-NEXT: lbu s6, 18(a0)
; RV64I-NEXT: lbu s7, 19(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a5, t0, a7
+; RV64I-NEXT: or a6, t2, t1
+; RV64I-NEXT: lbu s8, 20(a0)
+; RV64I-NEXT: lbu s9, 21(a0)
+; RV64I-NEXT: lbu s10, 22(a0)
+; RV64I-NEXT: lbu s11, 23(a0)
; RV64I-NEXT: slli t4, t4, 8
; RV64I-NEXT: slli t5, t5, 16
; RV64I-NEXT: slli t6, t6, 24
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a6, t2, t1
-; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: lbu t5, 20(a0)
-; RV64I-NEXT: lbu t6, 21(a0)
-; RV64I-NEXT: lbu s8, 22(a0)
-; RV64I-NEXT: lbu s9, 23(a0)
; RV64I-NEXT: slli s1, s1, 8
; RV64I-NEXT: slli s2, s2, 16
; RV64I-NEXT: slli s3, s3, 24
+; RV64I-NEXT: or a7, t4, t3
+; RV64I-NEXT: or t0, t6, t5
+; RV64I-NEXT: or t1, s1, s0
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: lbu t6, 24(a0)
+; RV64I-NEXT: lbu s0, 25(a0)
+; RV64I-NEXT: lbu s1, 26(a0)
+; RV64I-NEXT: lbu s2, 27(a0)
; RV64I-NEXT: slli s5, s5, 8
; RV64I-NEXT: slli s6, s6, 16
; RV64I-NEXT: slli s7, s7, 24
-; RV64I-NEXT: or t1, s1, s0
-; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: slli s9, s9, 8
; RV64I-NEXT: or t3, s5, s4
; RV64I-NEXT: or t4, s7, s6
-; RV64I-NEXT: lbu s0, 24(a0)
-; RV64I-NEXT: lbu s1, 25(a0)
-; RV64I-NEXT: lbu s2, 26(a0)
-; RV64I-NEXT: lbu s3, 27(a0)
-; RV64I-NEXT: slli t6, t6, 8
-; RV64I-NEXT: slli s8, s8, 16
-; RV64I-NEXT: slli s9, s9, 24
-; RV64I-NEXT: slli s1, s1, 8
-; RV64I-NEXT: or t5, t6, t5
-; RV64I-NEXT: or t6, s9, s8
-; RV64I-NEXT: or s0, s1, s0
-; RV64I-NEXT: lbu s1, 28(a0)
+; RV64I-NEXT: or t5, s9, s8
+; RV64I-NEXT: lbu s3, 28(a0)
; RV64I-NEXT: lbu s4, 29(a0)
; RV64I-NEXT: lbu s5, 30(a0)
; RV64I-NEXT: lbu s6, 31(a0)
-; RV64I-NEXT: lbu a0, 0(a1)
-; RV64I-NEXT: slli s2, s2, 16
-; RV64I-NEXT: slli s3, s3, 24
-; RV64I-NEXT: or a1, s3, s2
-; RV64I-NEXT: mv s2, sp
+; RV64I-NEXT: slli s10, s10, 16
+; RV64I-NEXT: slli s11, s11, 24
+; RV64I-NEXT: slli s0, s0, 8
+; RV64I-NEXT: slli s1, s1, 16
+; RV64I-NEXT: slli s2, s2, 24
; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or a0, s11, s10
+; RV64I-NEXT: or t6, s0, t6
+; RV64I-NEXT: or s0, s2, s1
+; RV64I-NEXT: or s1, s4, s3
+; RV64I-NEXT: lbu s2, 0(a1)
+; RV64I-NEXT: lbu s3, 1(a1)
+; RV64I-NEXT: lbu s4, 2(a1)
+; RV64I-NEXT: lbu s7, 3(a1)
; RV64I-NEXT: slli s5, s5, 16
; RV64I-NEXT: slli s6, s6, 24
-; RV64I-NEXT: or s1, s4, s1
-; RV64I-NEXT: srli s3, a0, 3
-; RV64I-NEXT: or s4, s6, s5
-; RV64I-NEXT: andi s5, a0, 63
-; RV64I-NEXT: andi s3, s3, 24
-; RV64I-NEXT: xori s5, s5, 63
+; RV64I-NEXT: slli s3, s3, 8
+; RV64I-NEXT: slli s4, s4, 16
+; RV64I-NEXT: slli s7, s7, 24
+; RV64I-NEXT: or s5, s6, s5
+; RV64I-NEXT: or s2, s3, s2
+; RV64I-NEXT: or s3, s7, s4
+; RV64I-NEXT: lbu s4, 5(a1)
+; RV64I-NEXT: lbu s6, 4(a1)
+; RV64I-NEXT: lbu s7, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli s4, s4, 8
+; RV64I-NEXT: or s4, s4, s6
+; RV64I-NEXT: slli s7, s7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, s7
+; RV64I-NEXT: mv s6, sp
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: or a5, t0, a7
; RV64I-NEXT: or a6, t2, t1
; RV64I-NEXT: or a7, t4, t3
-; RV64I-NEXT: or t0, t6, t5
-; RV64I-NEXT: or a1, a1, s0
-; RV64I-NEXT: or t1, s4, s1
-; RV64I-NEXT: add s2, s2, s3
+; RV64I-NEXT: or a0, a0, t5
+; RV64I-NEXT: or t0, s0, t6
+; RV64I-NEXT: or t1, s5, s1
+; RV64I-NEXT: or t2, s3, s2
+; RV64I-NEXT: or a1, a1, s4
; RV64I-NEXT: slli a4, a4, 32
; RV64I-NEXT: slli a6, a6, 32
-; RV64I-NEXT: slli t0, t0, 32
-; RV64I-NEXT: slli t2, t1, 32
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: slli t3, t1, 32
+; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: sraiw t1, t1, 31
; RV64I-NEXT: or a3, a4, a3
; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a5, t0, a7
-; RV64I-NEXT: or a1, t2, a1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: or a5, t3, t0
+; RV64I-NEXT: or a1, a1, t2
; RV64I-NEXT: sd t1, 32(sp)
; RV64I-NEXT: sd t1, 40(sp)
; RV64I-NEXT: sd t1, 48(sp)
; RV64I-NEXT: sd t1, 56(sp)
; RV64I-NEXT: sd a3, 0(sp)
; RV64I-NEXT: sd a4, 8(sp)
-; RV64I-NEXT: sd a5, 16(sp)
-; RV64I-NEXT: sd a1, 24(sp)
-; RV64I-NEXT: ld a1, 8(s2)
-; RV64I-NEXT: ld a3, 16(s2)
-; RV64I-NEXT: ld a4, 0(s2)
-; RV64I-NEXT: ld a5, 24(s2)
-; RV64I-NEXT: srl a6, a1, a0
-; RV64I-NEXT: slli a7, a3, 1
-; RV64I-NEXT: srl a4, a4, a0
-; RV64I-NEXT: slli a1, a1, 1
-; RV64I-NEXT: srl a3, a3, a0
+; RV64I-NEXT: sd a0, 16(sp)
+; RV64I-NEXT: sd a5, 24(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a3, a1, 63
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: xori a3, a3, 63
+; RV64I-NEXT: add a0, s6, a0
+; RV64I-NEXT: ld a4, 8(a0)
+; RV64I-NEXT: ld a5, 16(a0)
+; RV64I-NEXT: ld a6, 0(a0)
+; RV64I-NEXT: ld a0, 24(a0)
+; RV64I-NEXT: srl a7, a4, a1
; RV64I-NEXT: slli t0, a5, 1
-; RV64I-NEXT: sra a5, a5, a0
-; RV64I-NEXT: sll a0, a7, s5
-; RV64I-NEXT: sll a1, a1, s5
-; RV64I-NEXT: sll a7, t0, s5
-; RV64I-NEXT: srli t0, a5, 56
-; RV64I-NEXT: srli t1, a5, 48
-; RV64I-NEXT: srli t2, a5, 40
-; RV64I-NEXT: srli t3, a5, 32
-; RV64I-NEXT: srli t4, a5, 24
-; RV64I-NEXT: srli t5, a5, 16
-; RV64I-NEXT: srli t6, a5, 8
-; RV64I-NEXT: or a0, a6, a0
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: or a3, a3, a7
+; RV64I-NEXT: srl a6, a6, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: srl a5, a5, a1
+; RV64I-NEXT: slli t1, a0, 1
+; RV64I-NEXT: sra t2, a0, a1
+; RV64I-NEXT: sll a0, t0, a3
+; RV64I-NEXT: sll a1, a4, a3
+; RV64I-NEXT: sll a3, t1, a3
+; RV64I-NEXT: srli a4, t2, 56
+; RV64I-NEXT: srli t0, t2, 48
+; RV64I-NEXT: srli t1, t2, 40
+; RV64I-NEXT: srli t3, t2, 32
+; RV64I-NEXT: srli t4, t2, 24
+; RV64I-NEXT: srli t5, t2, 16
+; RV64I-NEXT: srli t6, t2, 8
+; RV64I-NEXT: or a0, a7, a0
+; RV64I-NEXT: or a1, a6, a1
+; RV64I-NEXT: or a3, a5, a3
; RV64I-NEXT: sb t3, 28(a2)
-; RV64I-NEXT: sb t2, 29(a2)
-; RV64I-NEXT: sb t1, 30(a2)
-; RV64I-NEXT: sb t0, 31(a2)
-; RV64I-NEXT: sb a5, 24(a2)
+; RV64I-NEXT: sb t1, 29(a2)
+; RV64I-NEXT: sb t0, 30(a2)
+; RV64I-NEXT: sb a4, 31(a2)
+; RV64I-NEXT: sb t2, 24(a2)
; RV64I-NEXT: sb t6, 25(a2)
; RV64I-NEXT: sb t5, 26(a2)
; RV64I-NEXT: sb t4, 27(a2)
@@ -2316,45 +2438,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: srli s3, a0, 56
; RV64I-NEXT: srli s4, a0, 48
; RV64I-NEXT: srli s5, a0, 40
+; RV64I-NEXT: srli s6, a0, 32
; RV64I-NEXT: sb a7, 20(a2)
; RV64I-NEXT: sb a6, 21(a2)
; RV64I-NEXT: sb a5, 22(a2)
; RV64I-NEXT: sb a4, 23(a2)
-; RV64I-NEXT: srli a4, a0, 32
+; RV64I-NEXT: srli a4, a0, 24
; RV64I-NEXT: sb a3, 16(a2)
; RV64I-NEXT: sb t2, 17(a2)
; RV64I-NEXT: sb t1, 18(a2)
; RV64I-NEXT: sb t0, 19(a2)
-; RV64I-NEXT: srli a3, a0, 24
+; RV64I-NEXT: srli a3, a0, 16
; RV64I-NEXT: sb t6, 4(a2)
; RV64I-NEXT: sb t5, 5(a2)
; RV64I-NEXT: sb t4, 6(a2)
; RV64I-NEXT: sb t3, 7(a2)
-; RV64I-NEXT: srli a5, a0, 16
+; RV64I-NEXT: srli a5, a0, 8
; RV64I-NEXT: sb a1, 0(a2)
; RV64I-NEXT: sb s2, 1(a2)
; RV64I-NEXT: sb s1, 2(a2)
; RV64I-NEXT: sb s0, 3(a2)
-; RV64I-NEXT: srli a1, a0, 8
-; RV64I-NEXT: sb a4, 12(a2)
+; RV64I-NEXT: sb s6, 12(a2)
; RV64I-NEXT: sb s5, 13(a2)
; RV64I-NEXT: sb s4, 14(a2)
; RV64I-NEXT: sb s3, 15(a2)
; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: sb a5, 10(a2)
-; RV64I-NEXT: sb a3, 11(a2)
-; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: sb a5, 9(a2)
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: sb a4, 11(a2)
+; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 160
; RV64I-NEXT: ret
;
; RV32I-LABEL: ashr_32bytes:
@@ -2379,148 +2503,159 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: lbu a7, 3(a0)
; RV32I-NEXT: lbu a5, 4(a0)
; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t6, 8(a0)
-; RV32I-NEXT: lbu s0, 9(a0)
-; RV32I-NEXT: lbu s4, 10(a0)
-; RV32I-NEXT: lbu s5, 11(a0)
-; RV32I-NEXT: lbu s6, 12(a0)
-; RV32I-NEXT: lbu s7, 13(a0)
-; RV32I-NEXT: lbu s8, 14(a0)
-; RV32I-NEXT: lbu s9, 15(a0)
-; RV32I-NEXT: lbu s10, 16(a0)
-; RV32I-NEXT: lbu s11, 17(a0)
-; RV32I-NEXT: lbu s2, 18(a0)
-; RV32I-NEXT: lbu s3, 19(a0)
+; RV32I-NEXT: lbu t1, 6(a0)
+; RV32I-NEXT: lbu t2, 7(a0)
+; RV32I-NEXT: lbu t3, 8(a0)
+; RV32I-NEXT: lbu t4, 9(a0)
+; RV32I-NEXT: lbu t5, 10(a0)
+; RV32I-NEXT: lbu t6, 11(a0)
+; RV32I-NEXT: lbu s0, 12(a0)
+; RV32I-NEXT: lbu s1, 13(a0)
+; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: lbu s3, 15(a0)
+; RV32I-NEXT: lbu s4, 16(a0)
+; RV32I-NEXT: lbu s5, 17(a0)
+; RV32I-NEXT: lbu s6, 18(a0)
+; RV32I-NEXT: lbu s7, 19(a0)
; RV32I-NEXT: slli a4, a4, 8
; RV32I-NEXT: slli a6, a6, 16
; RV32I-NEXT: slli a7, a7, 24
; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
; RV32I-NEXT: or a4, a7, a6
-; RV32I-NEXT: lbu t1, 20(a0)
-; RV32I-NEXT: lbu t2, 21(a0)
-; RV32I-NEXT: lbu t5, 22(a0)
-; RV32I-NEXT: lbu s1, 23(a0)
+; RV32I-NEXT: lbu s8, 20(a0)
+; RV32I-NEXT: lbu s9, 21(a0)
+; RV32I-NEXT: lbu s10, 22(a0)
+; RV32I-NEXT: lbu s11, 23(a0)
; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: slli s4, s4, 16
-; RV32I-NEXT: slli s5, s5, 24
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: slli t4, t4, 8
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli t6, t6, 24
; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: or a6, t4, t3
-; RV32I-NEXT: or a7, s0, t6
-; RV32I-NEXT: or t0, s5, s4
-; RV32I-NEXT: lbu t3, 24(a0)
-; RV32I-NEXT: lbu s4, 25(a0)
-; RV32I-NEXT: lbu s5, 26(a0)
-; RV32I-NEXT: lbu ra, 27(a0)
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: slli s11, s11, 8
-; RV32I-NEXT: or t4, s7, s6
-; RV32I-NEXT: or t6, s9, s8
-; RV32I-NEXT: or s0, s11, s10
-; RV32I-NEXT: lbu s6, 28(a0)
-; RV32I-NEXT: lbu s7, 29(a0)
-; RV32I-NEXT: lbu s8, 30(a0)
-; RV32I-NEXT: lbu s9, 31(a0)
-; RV32I-NEXT: lbu a0, 0(a1)
+; RV32I-NEXT: or a6, t2, t1
+; RV32I-NEXT: or a7, t4, t3
+; RV32I-NEXT: or t0, t6, t5
+; RV32I-NEXT: lbu ra, 24(a0)
+; RV32I-NEXT: lbu a3, 25(a0)
+; RV32I-NEXT: lbu t4, 26(a0)
+; RV32I-NEXT: lbu t5, 27(a0)
+; RV32I-NEXT: slli s1, s1, 8
; RV32I-NEXT: slli s2, s2, 16
; RV32I-NEXT: slli s3, s3, 24
-; RV32I-NEXT: or s2, s3, s2
-; RV32I-NEXT: addi s3, sp, 8
-; RV32I-NEXT: slli t2, t2, 8
-; RV32I-NEXT: slli t5, t5, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: slli s4, s4, 8
-; RV32I-NEXT: slli s5, s5, 16
-; RV32I-NEXT: slli ra, ra, 24
-; RV32I-NEXT: slli s7, s7, 8
-; RV32I-NEXT: slli s8, s8, 16
-; RV32I-NEXT: slli s9, s9, 24
-; RV32I-NEXT: or t1, t2, t1
-; RV32I-NEXT: srli a1, a0, 3
+; RV32I-NEXT: slli s5, s5, 8
+; RV32I-NEXT: or t1, s1, s0
+; RV32I-NEXT: or t2, s3, s2
+; RV32I-NEXT: or t3, s5, s4
+; RV32I-NEXT: lbu t6, 28(a0)
+; RV32I-NEXT: lbu s0, 29(a0)
+; RV32I-NEXT: lbu s1, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli s6, s6, 16
+; RV32I-NEXT: slli s7, s7, 24
+; RV32I-NEXT: slli s9, s9, 8
+; RV32I-NEXT: slli s10, s10, 16
+; RV32I-NEXT: slli s11, s11, 24
+; RV32I-NEXT: or s2, s7, s6
+; RV32I-NEXT: or s3, s9, s8
+; RV32I-NEXT: or s4, s11, s10
+; RV32I-NEXT: lbu s5, 0(a1)
+; RV32I-NEXT: lbu s6, 1(a1)
+; RV32I-NEXT: lbu s7, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, ra
+; RV32I-NEXT: addi s8, sp, 8
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli t5, t5, 24
+; RV32I-NEXT: slli s0, s0, 8
+; RV32I-NEXT: slli s1, s1, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: slli s6, s6, 8
+; RV32I-NEXT: slli s7, s7, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or t4, t5, t4
+; RV32I-NEXT: or t5, s0, t6
+; RV32I-NEXT: or s1, a0, s1
+; RV32I-NEXT: or t6, s6, s5
+; RV32I-NEXT: or a1, a1, s7
+; RV32I-NEXT: srai s0, a0, 31
+; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: or a4, a4, a0
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a7, t2, t1
+; RV32I-NEXT: or t0, s2, t3
+; RV32I-NEXT: or t1, s4, s3
+; RV32I-NEXT: or a3, t4, a3
; RV32I-NEXT: or t2, s1, t5
-; RV32I-NEXT: andi t5, a0, 31
-; RV32I-NEXT: or t3, s4, t3
-; RV32I-NEXT: or s1, ra, s5
-; RV32I-NEXT: or s4, s7, s6
-; RV32I-NEXT: or s5, s9, s8
-; RV32I-NEXT: srai s6, s9, 31
-; RV32I-NEXT: andi s7, a1, 28
-; RV32I-NEXT: xori a1, t5, 31
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or a6, t6, t4
-; RV32I-NEXT: or a7, s2, s0
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or t1, s1, t3
-; RV32I-NEXT: or t2, s5, s4
-; RV32I-NEXT: sw s6, 56(sp)
-; RV32I-NEXT: sw s6, 60(sp)
-; RV32I-NEXT: sw s6, 64(sp)
-; RV32I-NEXT: sw s6, 68(sp)
-; RV32I-NEXT: sw s6, 40(sp)
-; RV32I-NEXT: sw s6, 44(sp)
-; RV32I-NEXT: sw s6, 48(sp)
-; RV32I-NEXT: sw s6, 52(sp)
-; RV32I-NEXT: add s3, s3, s7
-; RV32I-NEXT: sw a7, 24(sp)
-; RV32I-NEXT: sw t0, 28(sp)
-; RV32I-NEXT: sw t1, 32(sp)
+; RV32I-NEXT: or a0, a1, t6
+; RV32I-NEXT: sw s0, 56(sp)
+; RV32I-NEXT: sw s0, 60(sp)
+; RV32I-NEXT: sw s0, 64(sp)
+; RV32I-NEXT: sw s0, 68(sp)
+; RV32I-NEXT: sw s0, 40(sp)
+; RV32I-NEXT: sw s0, 44(sp)
+; RV32I-NEXT: sw s0, 48(sp)
+; RV32I-NEXT: sw s0, 52(sp)
+; RV32I-NEXT: sw t0, 24(sp)
+; RV32I-NEXT: sw t1, 28(sp)
+; RV32I-NEXT: sw a3, 32(sp)
; RV32I-NEXT: sw t2, 36(sp)
-; RV32I-NEXT: sw a3, 8(sp)
-; RV32I-NEXT: sw a4, 12(sp)
-; RV32I-NEXT: sw a5, 16(sp)
-; RV32I-NEXT: sw a6, 20(sp)
-; RV32I-NEXT: lw a3, 0(s3)
-; RV32I-NEXT: lw a4, 4(s3)
-; RV32I-NEXT: lw a5, 8(s3)
-; RV32I-NEXT: lw a6, 12(s3)
-; RV32I-NEXT: lw a7, 16(s3)
-; RV32I-NEXT: lw t0, 20(s3)
-; RV32I-NEXT: lw t1, 24(s3)
-; RV32I-NEXT: lw t2, 28(s3)
-; RV32I-NEXT: srl t3, a4, a0
-; RV32I-NEXT: slli t4, a5, 1
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 12(sp)
+; RV32I-NEXT: sw a6, 16(sp)
+; RV32I-NEXT: sw a7, 20(sp)
+; RV32I-NEXT: srli a1, a0, 3
+; RV32I-NEXT: andi a3, a0, 31
+; RV32I-NEXT: andi a4, a1, 28
+; RV32I-NEXT: xori a1, a3, 31
+; RV32I-NEXT: add a4, s8, a4
+; RV32I-NEXT: lw a3, 0(a4)
+; RV32I-NEXT: lw a5, 4(a4)
+; RV32I-NEXT: lw a6, 8(a4)
+; RV32I-NEXT: lw a7, 12(a4)
+; RV32I-NEXT: lw t0, 16(a4)
+; RV32I-NEXT: lw t1, 20(a4)
+; RV32I-NEXT: lw t2, 24(a4)
+; RV32I-NEXT: lw a4, 28(a4)
+; RV32I-NEXT: srl t3, a5, a0
+; RV32I-NEXT: slli t4, a6, 1
; RV32I-NEXT: srl a3, a3, a0
-; RV32I-NEXT: slli a4, a4, 1
-; RV32I-NEXT: srl t5, a6, a0
-; RV32I-NEXT: slli t6, a7, 1
-; RV32I-NEXT: srl a5, a5, a0
-; RV32I-NEXT: slli a6, a6, 1
-; RV32I-NEXT: srl s0, t0, a0
-; RV32I-NEXT: slli s1, t1, 1
-; RV32I-NEXT: srl a7, a7, a0
-; RV32I-NEXT: slli t0, t0, 1
-; RV32I-NEXT: srl t1, t1, a0
-; RV32I-NEXT: slli s2, t2, 1
-; RV32I-NEXT: sra t2, t2, a0
+; RV32I-NEXT: slli a5, a5, 1
+; RV32I-NEXT: srl t5, a7, a0
+; RV32I-NEXT: slli t6, t0, 1
+; RV32I-NEXT: srl a6, a6, a0
+; RV32I-NEXT: slli a7, a7, 1
+; RV32I-NEXT: srl s0, t1, a0
+; RV32I-NEXT: slli s1, t2, 1
+; RV32I-NEXT: srl t0, t0, a0
+; RV32I-NEXT: slli t1, t1, 1
+; RV32I-NEXT: srl t2, t2, a0
+; RV32I-NEXT: slli s2, a4, 1
+; RV32I-NEXT: sra s3, a4, a0
; RV32I-NEXT: sll a0, t4, a1
-; RV32I-NEXT: sll a4, a4, a1
-; RV32I-NEXT: sll t4, t6, a1
-; RV32I-NEXT: sll a6, a6, a1
-; RV32I-NEXT: sll t6, s1, a1
-; RV32I-NEXT: sll t0, t0, a1
-; RV32I-NEXT: sll s1, s2, a1
-; RV32I-NEXT: srli s2, t2, 24
-; RV32I-NEXT: srli s3, t2, 16
-; RV32I-NEXT: srli s4, t2, 8
+; RV32I-NEXT: sll a4, a5, a1
+; RV32I-NEXT: sll a5, t6, a1
+; RV32I-NEXT: sll a7, a7, a1
+; RV32I-NEXT: sll t4, s1, a1
+; RV32I-NEXT: sll t1, t1, a1
+; RV32I-NEXT: sll t6, s2, a1
+; RV32I-NEXT: srli s1, s3, 24
+; RV32I-NEXT: srli s2, s3, 16
+; RV32I-NEXT: srli s4, s3, 8
; RV32I-NEXT: or a0, t3, a0
; RV32I-NEXT: or a1, a3, a4
-; RV32I-NEXT: or a3, t5, t4
-; RV32I-NEXT: or a4, a5, a6
-; RV32I-NEXT: or a5, s0, t6
-; RV32I-NEXT: or a6, a7, t0
-; RV32I-NEXT: or a7, t1, s1
-; RV32I-NEXT: sb t2, 28(a2)
+; RV32I-NEXT: or a3, t5, a5
+; RV32I-NEXT: or a4, a6, a7
+; RV32I-NEXT: or a5, s0, t4
+; RV32I-NEXT: or a6, t0, t1
+; RV32I-NEXT: or a7, t2, t6
+; RV32I-NEXT: sb s3, 28(a2)
; RV32I-NEXT: sb s4, 29(a2)
-; RV32I-NEXT: sb s3, 30(a2)
-; RV32I-NEXT: sb s2, 31(a2)
+; RV32I-NEXT: sb s2, 30(a2)
+; RV32I-NEXT: sb s1, 31(a2)
; RV32I-NEXT: srli t0, a7, 24
; RV32I-NEXT: srli t1, a7, 16
; RV32I-NEXT: srli t2, a7, 8
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index bbf4d50bd716d..8a6a30318ae58 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -16,31 +16,31 @@ define dso_local void @m() local_unnamed_addr #1 {
; CHECK-NEXT: stmg %r13, %r15, 104(%r15)
; CHECK-NEXT: aghi %r15, -168
; CHECK-NEXT: lhrl %r1, f+4
+; CHECK-NEXT: sll %r1, 8
; CHECK-NEXT: larl %r2, f
-; CHECK-NEXT: llc %r2, 6(%r2)
-; CHECK-NEXT: larl %r3, e
-; CHECK-NEXT: lb %r0, 3(%r3)
-; CHECK-NEXT: rosbg %r2, %r1, 32, 55, 8
-; CHECK-NEXT: vlvgp %v0, %r2, %r0
-; CHECK-NEXT: vlvgf %v0, %r2, 0
-; CHECK-NEXT: vlvgf %v0, %r2, 2
-; CHECK-NEXT: vlvgp %v1, %r0, %r2
-; CHECK-NEXT: vlvgp %v2, %r2, %r2
-; CHECK-NEXT: lr %r1, %r2
+; CHECK-NEXT: ic %r1, 6(%r2)
+; CHECK-NEXT: larl %r2, e
+; CHECK-NEXT: lb %r0, 3(%r2)
+; CHECK-NEXT: vlvgp %v0, %r0, %r1
+; CHECK-NEXT: vlvgp %v1, %r1, %r0
+; CHECK-NEXT: vlvgf %v1, %r1, 0
+; CHECK-NEXT: vlvgf %v1, %r1, 2
+; CHECK-NEXT: vlvgp %v2, %r1, %r1
+; CHECK-NEXT: # kill: def $r1l killed $r1l killed $r1d
; CHECK-NEXT: nilh %r1, 255
; CHECK-NEXT: chi %r1, 128
; CHECK-NEXT: ipm %r1
; CHECK-NEXT: risbg %r1, %r1, 63, 191, 36
+; CHECK-NEXT: vlvgf %v0, %r0, 0
+; CHECK-NEXT: vlvgf %v0, %r0, 2
; CHECK-NEXT: vgbm %v3, 30583
; CHECK-NEXT: vn %v0, %v0, %v3
-; CHECK-NEXT: vlvgf %v1, %r0, 0
-; CHECK-NEXT: vlvgf %v1, %r0, 2
; CHECK-NEXT: vn %v1, %v1, %v3
; CHECK-NEXT: vrepf %v2, %v2, 1
; CHECK-NEXT: vn %v2, %v2, %v3
; CHECK-NEXT: vrepif %v3, 127
-; CHECK-NEXT: vchlf %v0, %v0, %v3
-; CHECK-NEXT: vlgvf %r13, %v0, 0
+; CHECK-NEXT: vchlf %v1, %v1, %v3
+; CHECK-NEXT: vlgvf %r13, %v1, 0
; CHECK-NEXT: vchlf %v2, %v2, %v3
; CHECK-NEXT: vlgvf %r3, %v2, 1
; CHECK-NEXT: nilf %r3, 1
@@ -54,13 +54,13 @@ define dso_local void @m() local_unnamed_addr #1 {
; CHECK-NEXT: nilf %r14, 1
; CHECK-NEXT: rosbg %r2, %r14, 32, 51, 12
; CHECK-NEXT: rosbg %r2, %r13, 52, 52, 11
-; CHECK-NEXT: vlgvf %r13, %v0, 1
+; CHECK-NEXT: vlgvf %r13, %v1, 1
; CHECK-NEXT: rosbg %r2, %r13, 53, 53, 10
-; CHECK-NEXT: vlgvf %r13, %v0, 2
+; CHECK-NEXT: vlgvf %r13, %v1, 2
; CHECK-NEXT: rosbg %r2, %r13, 54, 54, 9
-; CHECK-NEXT: vlgvf %r13, %v0, 3
+; CHECK-NEXT: vlgvf %r13, %v1, 3
; CHECK-NEXT: rosbg %r2, %r13, 55, 55, 8
-; CHECK-NEXT: vchlf %v0, %v1, %v3
+; CHECK-NEXT: vchlf %v0, %v0, %v3
; CHECK-NEXT: vlgvf %r13, %v0, 0
; CHECK-NEXT: rosbg %r2, %r13, 56, 56, 7
; CHECK-NEXT: vlgvf %r13, %v0, 1
diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll
index f6d66ab47ce05..6e22d855dc831 100644
--- a/llvm/test/CodeGen/X86/abds-neg.ll
+++ b/llvm/test/CodeGen/X86/abds-neg.ll
@@ -1058,15 +1058,15 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: subl %esi, %eax
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: sbbl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
@@ -1089,15 +1089,15 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: subl %esi, %eax
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: sbbl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
@@ -1121,11 +1121,11 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
@@ -1178,11 +1178,11 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 9be816655072c..0de308a9e0738 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -1734,20 +1734,20 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: not_avg_v16i8_wide_constants:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm1
-; SSE2-NEXT: movdqa (%rsi), %xmm2
+; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: movd %eax, %xmm4
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
; SSE2-NEXT: movd %eax, %xmm5
@@ -1762,6 +1762,9 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-NEXT: movd %eax, %xmm8
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
+; SSE2-NEXT: movd %eax, %xmm10
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: decl %eax
; SSE2-NEXT: movd %eax, %xmm9
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
@@ -1771,9 +1774,6 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-NEXT: movd %eax, %xmm12
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm10
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: decl %eax
; SSE2-NEXT: movd %eax, %xmm13
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
@@ -1783,43 +1783,45 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; SSE2-NEXT: movd %eax, %xmm15
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: decl %eax
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE2-NEXT: movapd %xmm4, %xmm5
; SSE2-NEXT: andpd %xmm1, %xmm5
; SSE2-NEXT: xorpd %xmm4, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: paddw %xmm5, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; SSE2-NEXT: movapd %xmm0, %xmm3
-; SSE2-NEXT: andpd %xmm2, %xmm3
-; SSE2-NEXT: xorpd %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: paddw %xmm3, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE2-NEXT: movapd %xmm2, %xmm3
+; SSE2-NEXT: andpd %xmm0, %xmm3
+; SSE2-NEXT: xorpd %xmm2, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm0, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
@@ -1829,71 +1831,75 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpextrw $3, %xmm3, %edx
-; AVX1-NEXT: vpextrw $2, %xmm3, %ecx
-; AVX1-NEXT: vpextrw $1, %xmm3, %eax
+; AVX1-NEXT: vpextrw $7, %xmm3, %edx
+; AVX1-NEXT: vpextrw $6, %xmm3, %ecx
+; AVX1-NEXT: vpextrw $5, %xmm3, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm4
-; AVX1-NEXT: vpextrw $0, %xmm3, %edx
+; AVX1-NEXT: vpextrw $4, %xmm3, %edx
; AVX1-NEXT: decl %ecx
; AVX1-NEXT: vmovd %ecx, %xmm5
-; AVX1-NEXT: vpextrw $3, %xmm2, %ecx
+; AVX1-NEXT: vpextrw $1, %xmm3, %ecx
; AVX1-NEXT: decl %eax
; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vpextrw $2, %xmm2, %eax
+; AVX1-NEXT: vpextrw $0, %xmm3, %eax
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vmovd %edx, %xmm7
-; AVX1-NEXT: vpextrw $1, %xmm2, %edx
-; AVX1-NEXT: decl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm8
-; AVX1-NEXT: vpextrw $0, %xmm2, %ecx
-; AVX1-NEXT: decl %eax
-; AVX1-NEXT: vmovd %eax, %xmm9
-; AVX1-NEXT: vpextrw $7, %xmm3, %eax
+; AVX1-NEXT: vpextrw $3, %xmm3, %edx
+; AVX1-NEXT: decq %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm8
+; AVX1-NEXT: vpextrw $2, %xmm3, %ecx
+; AVX1-NEXT: decq %rax
+; AVX1-NEXT: vmovq %rax, %xmm3
+; AVX1-NEXT: vpextrw $7, %xmm2, %eax
; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm10
-; AVX1-NEXT: vpextrw $6, %xmm3, %edx
+; AVX1-NEXT: vmovd %edx, %xmm9
+; AVX1-NEXT: vpextrw $6, %xmm2, %edx
; AVX1-NEXT: decl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm11
-; AVX1-NEXT: vpextrw $7, %xmm2, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm10
+; AVX1-NEXT: vpextrw $5, %xmm2, %ecx
; AVX1-NEXT: decl %eax
-; AVX1-NEXT: vmovd %eax, %xmm12
-; AVX1-NEXT: vpextrw $6, %xmm2, %eax
+; AVX1-NEXT: vmovd %eax, %xmm11
+; AVX1-NEXT: vpextrw $4, %xmm2, %eax
; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm13
-; AVX1-NEXT: vpextrw $5, %xmm3, %edx
+; AVX1-NEXT: vmovd %edx, %xmm12
+; AVX1-NEXT: vpextrw $1, %xmm2, %edx
; AVX1-NEXT: decl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm14
-; AVX1-NEXT: vpextrw $4, %xmm3, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm13
+; AVX1-NEXT: vpextrw $0, %xmm2, %ecx
; AVX1-NEXT: decl %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vpextrw $5, %xmm2, %eax
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm15
-; AVX1-NEXT: vpextrw $4, %xmm2, %edx
-; AVX1-NEXT: decl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: vmovd %eax, %xmm14
+; AVX1-NEXT: vpextrw $3, %xmm2, %eax
+; AVX1-NEXT: decq %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm15
+; AVX1-NEXT: vpextrw $2, %xmm2, %edx
+; AVX1-NEXT: decq %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: decl %eax
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX1-NEXT: vmovd %eax, %xmm5
; AVX1-NEXT: decl %edx
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX1-NEXT: vmovd %edx, %xmm7
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT: vmovddup {{.*#+}} ymm2 = ymm2[0,0,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm1
; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index d869f8ec01a5a..8f82a5bc6554e 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -172,10 +172,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%ebp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %ecx, %esi
; X86-NEXT: xorl %edx, %esi
; X86-NEXT: movl 36(%ebp), %ecx
; X86-NEXT: xorl %edx, %ecx
@@ -204,45 +203,45 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
-; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %esi, %esi
-; X86-NEXT: cmovel %edx, %ecx
+; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bsrl %edi, %edi
; X86-NEXT: xorl $31, %edi
-; X86-NEXT: addl $32, %edi
+; X86-NEXT: orl $32, %edi
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %edi
-; X86-NEXT: addl $64, %edi
+; X86-NEXT: orl $64, %edi
; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %esi, %edx
; X86-NEXT: cmovnel %ecx, %edi
-; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %eax, %eax
-; X86-NEXT: cmovel %edx, %ecx
+; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: bsrl %ebx, %esi
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
+; X86-NEXT: orl $32, %edx
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: addl $64, %edx
+; X86-NEXT: orl $64, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: orl %eax, %esi
; X86-NEXT: cmovnel %ecx, %edx
@@ -380,9 +379,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $-1, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 7bbddefd82721..0bef9ee50bd54 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -173,17 +173,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: xorl $31, %edx
; X86-NEXT: bsrl 36(%ebp), %ecx
; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: addl $32, %ecx
+; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: bsrl %ebx, %eax
; X86-NEXT: xorl $31, %eax
-; X86-NEXT: addl $32, %eax
+; X86-NEXT: orl $32, %eax
; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %edx, %eax
-; X86-NEXT: addl $64, %eax
+; X86-NEXT: orl $64, %eax
; X86-NEXT: movl 36(%ebp), %edx
; X86-NEXT: orl %esi, %edx
; X86-NEXT: cmovnel %ecx, %eax
@@ -193,7 +193,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl 20(%ebp), %ecx
; X86-NEXT: bsrl %ecx, %ecx
; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: addl $32, %ecx
+; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: movl 16(%ebp), %edi
@@ -201,10 +201,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl 12(%ebp), %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
+; X86-NEXT: orl $32, %edx
; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: addl $64, %edx
+; X86-NEXT: orl $64, %edx
; X86-NEXT: movl 20(%ebp), %edi
; X86-NEXT: movl %edi, %esi
; X86-NEXT: orl %ebx, %esi
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 0f66d42697d97..953a5e7285fe4 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -171,15 +171,15 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: vmovdqa (%ecx), %xmm0
-; X86-NEXT: vpand (%edx), %xmm0, %xmm0
+; X86-NEXT: vmovdqa (%edx), %xmm0
+; X86-NEXT: vpand (%ecx), %xmm0, %xmm0
; X86-NEXT: vpextrb $6, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: freeze_extractelement:
; X64: # %bb.0:
-; X64-NEXT: vmovdqa (%rsi), %xmm0
-; X64-NEXT: vpand (%rdi), %xmm0, %xmm0
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vpand (%rsi), %xmm0, %xmm0
; X64-NEXT: vpextrb $6, %xmm0, (%rdx)
; X64-NEXT: retq
%i0 = load <16 x i8>, ptr %origin0
@@ -198,8 +198,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst,
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: vmovdqa (%edx), %xmm0
-; X86-NEXT: vpand (%esi), %xmm0, %xmm0
+; X86-NEXT: vmovdqa (%esi), %xmm0
+; X86-NEXT: vpand (%edx), %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%ecx)
; X86-NEXT: vpextrb $6, %xmm0, (%eax)
; X86-NEXT: popl %esi
@@ -207,8 +207,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst,
;
; X64-LABEL: freeze_extractelement_escape:
; X64: # %bb.0:
-; X64-NEXT: vmovdqa (%rsi), %xmm0
-; X64-NEXT: vpand (%rdi), %xmm0, %xmm0
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vpand (%rsi), %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rcx)
; X64-NEXT: vpextrb $6, %xmm0, (%rdx)
; X64-NEXT: retq
@@ -239,8 +239,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
; X86-NEXT: movl 32(%ebp), %edx
; X86-NEXT: movl 12(%ebp), %esi
; X86-NEXT: movl 8(%ebp), %edi
-; X86-NEXT: vmovaps (%esi), %xmm0
-; X86-NEXT: vandps (%edi), %xmm0, %xmm0
+; X86-NEXT: vmovaps (%edi), %xmm0
+; X86-NEXT: vandps (%esi), %xmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, (%esp)
; X86-NEXT: movzbl (%esp,%ecx), %ecx
; X86-NEXT: cmpb (%esp,%eax), %cl
@@ -255,8 +255,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
; X64: # %bb.0:
; X64-NEXT: andl $15, %ecx
; X64-NEXT: andl $15, %edx
-; X64-NEXT: vmovaps (%rsi), %xmm0
-; X64-NEXT: vandps (%rdi), %xmm0, %xmm0
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vandps (%rsi), %xmm0, %xmm0
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -24(%rsp,%rdx), %eax
; X64-NEXT: cmpb -24(%rsp,%rcx), %al
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index d2b292f1a7996..2ac2be5545dfd 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -119,8 +119,8 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: .LBB0_2: # %vector.body
; CHECK-AVX2-NEXT: # Parent Loop BB0_1 Depth=1
; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %ymm5
-; CHECK-AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5
+; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6
; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %rdi
; CHECK-AVX2-NEXT: vpextrq $1, %xmm6, %r8
; CHECK-AVX2-NEXT: vmovq %xmm5, %r9
>From e1e229034bd46f0c1f494080335dc8ef376fca03 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 23 Jul 2025 09:20:50 +0100
Subject: [PATCH 5/5] [RISCV] Add test coverage for #148084
---
llvm/test/CodeGen/RISCV/pr148084.ll | 279 ++++++++++++++++++++++++++++
1 file changed, 279 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/pr148084.ll
diff --git a/llvm/test/CodeGen/RISCV/pr148084.ll b/llvm/test/CodeGen/RISCV/pr148084.ll
new file mode 100644
index 0000000000000..9fa26c74021cb
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr148084.ll
@@ -0,0 +1,279 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+
+source_filename = "external/libaom/av1/encoder/tx_search.c"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-android10000"
+
+define fastcc void @search_tx_type() #0 {
+; CHECK-LABEL: search_tx_type:
+; CHECK: # %bb.0: # %._crit_edge.i
+; CHECK-NEXT: # %bb.1: # %bb
+; CHECK-NEXT: lbu a1, 0(zero)
+; CHECK-NEXT: lw a0, 0(zero)
+; CHECK-NEXT: lh a2, 0(zero)
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: srai a3, a0, 63
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: and a1, a1, a2
+; CHECK-NEXT: andi a2, a1, 1
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: or a3, a3, a0
+; CHECK-NEXT: or a2, a2, a3
+; CHECK-NEXT: bgez a2, .LBB0_3
+; CHECK-NEXT: # %bb.2:
+; CHECK-NEXT: bexti a3, a1, 1
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a2, a3
+; CHECK-NEXT: .LBB0_3: # %bb
+; CHECK-NEXT: andi a4, a1, 4
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: beqz a4, .LBB0_5
+; CHECK-NEXT: # %bb.4: # %bb
+; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: .LBB0_5: # %bb
+; CHECK-NEXT: blt a2, a0, .LBB0_7
+; CHECK-NEXT: # %bb.6: # %bb
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: .LBB0_7: # %bb
+; CHECK-NEXT: andi a5, a1, 8
+; CHECK-NEXT: sext.w a4, a3
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: beqz a5, .LBB0_9
+; CHECK-NEXT: # %bb.8: # %bb
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: .LBB0_9: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_11
+; CHECK-NEXT: # %bb.10: # %bb
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB0_11: # %bb
+; CHECK-NEXT: andi a5, a1, 16
+; CHECK-NEXT: sext.w a4, a2
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: beqz a5, .LBB0_13
+; CHECK-NEXT: # %bb.12: # %bb
+; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: .LBB0_13: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_15
+; CHECK-NEXT: # %bb.14: # %bb
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: .LBB0_15: # %bb
+; CHECK-NEXT: andi a5, a1, 32
+; CHECK-NEXT: sext.w a4, a3
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: beqz a5, .LBB0_17
+; CHECK-NEXT: # %bb.16: # %bb
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: .LBB0_17: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_19
+; CHECK-NEXT: # %bb.18: # %bb
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB0_19: # %bb
+; CHECK-NEXT: andi a5, a1, 64
+; CHECK-NEXT: sext.w a4, a2
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: beqz a5, .LBB0_21
+; CHECK-NEXT: # %bb.20: # %bb
+; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: .LBB0_21: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_23
+; CHECK-NEXT: # %bb.22: # %bb
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: .LBB0_23: # %bb
+; CHECK-NEXT: andi a5, a1, 128
+; CHECK-NEXT: sext.w a4, a3
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: beqz a5, .LBB0_25
+; CHECK-NEXT: # %bb.24: # %bb
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: .LBB0_25: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_27
+; CHECK-NEXT: # %bb.26: # %bb
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB0_27: # %bb
+; CHECK-NEXT: andi a5, a1, 256
+; CHECK-NEXT: sext.w a4, a2
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: beqz a5, .LBB0_29
+; CHECK-NEXT: # %bb.28: # %bb
+; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: .LBB0_29: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_31
+; CHECK-NEXT: # %bb.30: # %bb
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: .LBB0_31: # %bb
+; CHECK-NEXT: andi a5, a1, 512
+; CHECK-NEXT: sext.w a4, a3
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: beqz a5, .LBB0_33
+; CHECK-NEXT: # %bb.32: # %bb
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: .LBB0_33: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_35
+; CHECK-NEXT: # %bb.34: # %bb
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB0_35: # %bb
+; CHECK-NEXT: andi a5, a1, 1024
+; CHECK-NEXT: sext.w a4, a2
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: beqz a5, .LBB0_37
+; CHECK-NEXT: # %bb.36: # %bb
+; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: .LBB0_37: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_39
+; CHECK-NEXT: # %bb.38: # %bb
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: .LBB0_39: # %bb
+; CHECK-NEXT: slli a5, a1, 52
+; CHECK-NEXT: sext.w a4, a3
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: bgez a5, .LBB0_41
+; CHECK-NEXT: # %bb.40: # %bb
+; CHECK-NEXT: mv a2, a0
+; CHECK-NEXT: .LBB0_41: # %bb
+; CHECK-NEXT: blt a4, a0, .LBB0_43
+; CHECK-NEXT: # %bb.42: # %bb
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB0_43: # %bb
+; CHECK-NEXT: slli a4, a1, 51
+; CHECK-NEXT: sext.w a3, a2
+; CHECK-NEXT: mv a1, a2
+; CHECK-NEXT: bltz a4, .LBB0_49
+; CHECK-NEXT: # %bb.44: # %bb
+; CHECK-NEXT: bge a3, a0, .LBB0_50
+; CHECK-NEXT: .LBB0_45: # %bb
+; CHECK-NEXT: sext.w a2, a1
+; CHECK-NEXT: blt a2, a0, .LBB0_47
+; CHECK-NEXT: .LBB0_46: # %bb
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB0_47: # %bb
+; CHECK-NEXT: sext.w a0, a0
+; CHECK-NEXT: # %bb.48: # %get_tx_mask.exit
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_49: # %bb
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: blt a3, a0, .LBB0_45
+; CHECK-NEXT: .LBB0_50: # %bb
+; CHECK-NEXT: mv a1, a2
+; CHECK-NEXT: sext.w a2, a2
+; CHECK-NEXT: bge a2, a0, .LBB0_46
+; CHECK-NEXT: j .LBB0_47
+._crit_edge.i:
+ %.in196.i = load i16, ptr null, align 2
+ %i2 = load i16, ptr null, align 2
+ %i3 = and i16 %i2, %.in196.i
+ %i9 = trunc nuw i8 0 to i1
+ br i1 %i9, label %get_tx_mask.exit, label %bb
+
+bb: ; preds = %._crit_edge.i
+ %i13 = load i8, ptr null, align 1
+ %i14 = icmp eq i8 %i13, 0
+ %spec.select211.i = select i1 %i14, i16 0, i16 %i3
+ %i19 = load i32, ptr null, align 4
+ %i20 = zext i16 %spec.select211.i to i32
+ %i21 = load i32, ptr null, align 4
+ %i22 = icmp sgt i32 %i21, -1
+ %i23 = and i32 %i20, 1
+ %.not203.i = icmp eq i32 %i23, 0
+ %spec.select212.i = select i1 %.not203.i, i32 -1, i32 %i21
+ %.1174.i = select i1 %i22, i32 %spec.select212.i, i32 -1
+ %i28 = icmp sgt i32 0, %.1174.i
+ %i29 = and i32 %i20, 2
+ %.not203.1.not.i = icmp eq i32 %i29, 0
+ %spec.select212.1.i = select i1 %.not203.1.not.i, i32 %.1174.i, i32 0
+ %.1174.1.i = select i1 %i28, i32 %spec.select212.1.i, i32 %.1174.i
+ %i30 = load i32, ptr null, align 4
+ %i31 = icmp sgt i32 %i30, %.1174.1.i
+ %i32 = and i32 %i20, 4
+ %.not203.2.i = icmp eq i32 %i32, 0
+ %spec.select212.2.i = select i1 %.not203.2.i, i32 %.1174.1.i, i32 %i30
+ %.1174.2.i = select i1 %i31, i32 %spec.select212.2.i, i32 %.1174.1.i
+ %i36 = load i32, ptr null, align 4
+ %i37 = icmp sgt i32 %i36, %.1174.2.i
+ %i38 = and i32 %i20, 8
+ %.not203.3.i = icmp eq i32 %i38, 0
+ %spec.select212.3.i = select i1 %.not203.3.i, i32 %.1174.2.i, i32 %i36
+ %.1174.3.i = select i1 %i37, i32 %spec.select212.3.i, i32 %.1174.2.i
+ %i42 = load i32, ptr null, align 4
+ %i43 = icmp sgt i32 %i42, %.1174.3.i
+ %i44 = and i32 %i20, 16
+ %.not203.4.i = icmp eq i32 %i44, 0
+ %spec.select212.4.i = select i1 %.not203.4.i, i32 %.1174.3.i, i32 %i42
+ %.1174.4.i = select i1 %i43, i32 %spec.select212.4.i, i32 %.1174.3.i
+ %i48 = load i32, ptr null, align 4
+ %i49 = icmp sgt i32 %i48, %.1174.4.i
+ %i50 = and i32 %i20, 32
+ %.not203.5.i = icmp eq i32 %i50, 0
+ %spec.select212.5.i = select i1 %.not203.5.i, i32 %.1174.4.i, i32 %i48
+ %.1174.5.i = select i1 %i49, i32 %spec.select212.5.i, i32 %.1174.4.i
+ %i51 = load i32, ptr null, align 4
+ %i52 = icmp sgt i32 %i51, %.1174.5.i
+ %i53 = and i32 %i20, 64
+ %.not203.6.i = icmp eq i32 %i53, 0
+ %spec.select212.6.i = select i1 %.not203.6.i, i32 %.1174.5.i, i32 %i51
+ %.1174.6.i = select i1 %i52, i32 %spec.select212.6.i, i32 %.1174.5.i
+ %i56 = load i32, ptr null, align 4
+ %i57 = icmp sgt i32 %i56, %.1174.6.i
+ %i58 = and i32 %i20, 128
+ %.not203.7.i = icmp eq i32 %i58, 0
+ %spec.select212.7.i = select i1 %.not203.7.i, i32 %.1174.6.i, i32 %i56
+ %.1174.7.i = select i1 %i57, i32 %spec.select212.7.i, i32 %.1174.6.i
+ %i60 = load i32, ptr null, align 4
+ %i61 = icmp sgt i32 %i60, %.1174.7.i
+ %i62 = and i32 %i20, 256
+ %.not203.8.i = icmp eq i32 %i62, 0
+ %spec.select212.8.i = select i1 %.not203.8.i, i32 %.1174.7.i, i32 %i60
+ %.1174.8.i = select i1 %i61, i32 %spec.select212.8.i, i32 %.1174.7.i
+ %i63 = load i32, ptr null, align 4
+ %i64 = icmp sgt i32 %i63, %.1174.8.i
+ %i65 = and i32 %i20, 512
+ %.not203.9.i = icmp eq i32 %i65, 0
+ %spec.select212.9.i = select i1 %.not203.9.i, i32 %.1174.8.i, i32 %i63
+ %.1174.9.i = select i1 %i64, i32 %spec.select212.9.i, i32 %.1174.8.i
+ %i67 = load i32, ptr null, align 4
+ %i68 = icmp sgt i32 %i67, %.1174.9.i
+ %i69 = and i32 %i20, 1024
+ %.not203.10.i = icmp eq i32 %i69, 0
+ %spec.select212.10.i = select i1 %.not203.10.i, i32 %.1174.9.i, i32 %i67
+ %.1174.10.i = select i1 %i68, i32 %spec.select212.10.i, i32 %.1174.9.i
+ %i70 = load i32, ptr null, align 4
+ %i71 = icmp sgt i32 %i70, %.1174.10.i
+ %i72 = and i32 %i20, 2048
+ %.not203.11.i = icmp eq i32 %i72, 0
+ %spec.select212.11.i = select i1 %.not203.11.i, i32 %.1174.10.i, i32 %i70
+ %.1174.11.i = select i1 %i71, i32 %spec.select212.11.i, i32 %.1174.10.i
+ %i75 = load i32, ptr null, align 4
+ %i76 = icmp sgt i32 %i75, %.1174.11.i
+ %i77 = and i32 %i20, 4096
+ %.not203.12.i = icmp eq i32 %i77, 0
+ %spec.select212.12.i = select i1 %.not203.12.i, i32 %.1174.11.i, i32 %i75
+ %.1174.12.i = select i1 %i76, i32 %spec.select212.12.i, i32 %.1174.11.i
+ %i80 = load i32, ptr null, align 4
+ %i81 = icmp sgt i32 %i80, %.1174.12.i
+ %spec.select212.13.i = select i1 false, i32 %.1174.12.i, i32 %i80
+ %.1174.13.i = select i1 %i81, i32 %spec.select212.13.i, i32 %.1174.12.i
+ %.1172.13.i = select i1 %i81, i32 13, i32 0
+ %i84 = icmp sgt i32 0, %.1174.13.i
+ %.1172.14.i = select i1 %i84, i32 14, i32 %.1172.13.i
+ %i88 = icmp slt i32 0, %i19
+ %i89 = select i1 %i88, i16 -32768, i16 0
+ %i90 = zext i16 %i89 to i32
+ %i91 = shl nuw nsw i32 1, %.1172.14.i
+ %i92 = and i32 %i91, %i90
+ %.not200.i = icmp eq i32 %i92, 0
+ %i93 = trunc nuw i32 %i91 to i16
+ %i94 = xor i16 %i93, -1
+ %i95 = select i1 %.not200.i, i16 -1, i16 %i94
+ %.2177.i = and i16 %i95, %i89
+ %i96 = xor i16 %.2177.i, -1
+ %i97 = and i16 %spec.select211.i, %i96
+ br label %get_tx_mask.exit
+
+get_tx_mask.exit: ; preds = %._crit_edge.i, %bb
+ %.1261.i = phi i16 [ %i97, %bb ], [ 0, %._crit_edge.i ]
+ %i99 = icmp eq i16 %.1261.i, 0
+ %.2262.i = select i1 %i99, i16 0, i16 %.1261.i
+ ret void
+}
+
+attributes #0 = { noimplicitfloat nounwind sspstrong uwtable vscale_range(2,1024) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+b,+c,+d,+f,+m,+relax,+unaligned-scalar-mem,+unaligned-vector-mem,+v,+zaamo,+zalrsc,+zba,+zbb,+zbs,+zca,+zcd,+zicsr,+zifencei,+zmmul,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-p,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqccmp,-experimental-xqcia,-experimental-xqciac,-experimental-xqcibi,-experimental-xqcibm,-experimental-xqcicli,-experimental-xqcicm,-experimental-xqcics,-experimental-xqcicsr,-experimental-xqciint,-experimental-xqciio,-experimental-xqcilb,-experimental-xqcili,-experimental-xqcilia,-experimental-xqcilo,-experimental-xqcilsm,-experimental-xqcisim,-experimental-xqcisls,-experimental-xqcisync,-experimental-xrivosvisni,-experimental-xrivosvizip,-experimental-xsfmclic,-experimental-xsfsclic,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-experimental-zvqdotq,-h,-q,-sdext,-sdtrig,-sha,-shcounterenw,-shgatpa,-shlcofideleg,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcntrpmf,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-supm,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xandesperf,-xandesvbfhcvt,-xandesvdot,-xandesvpackfph,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xmipscmov,-xmipslsp,-xsfcease,-xsfmm128t,-xsfmm16t,-xsfmm32a16f,-xsfmm32a32f,-xsfmm32a8f,-xsfmm32a8i,-xsfmm32t,-xsfmm64a64f,-xsfmm64t,-xsfmmbase,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-za64rs,-zabha,-zacas,-zama16b,-zawrs,-zbc,-zbkb,-zbkc,-zbkx,-zcb,-zce,-zcf,-zclsd,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccamoc,-ziccif,-zicclsm,-ziccrse,-zicntr,-zicond,-zihintntl,-zihintpause,-zihpm,-zilsd,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-ztso,-zvbb,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }